| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
70370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761
27712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761
77717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762
27722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432243324342435243624372438243924402441244224432444244524462447244824492450245124522453245424552456245724582459246024612462246324642465246624672468246924702471247224732474247524762477247824792480248124822483248424852486248724882489249024912492249324942495249624972498249925002501250225032504250525062507250825092510251125122513251425152516251725182519252025212522252325242525252625272528252925302531253225332534253525362537253825392540254125422543254425452546254725482549255025512552255325542555255625572558255925602561256225632564256525662567256825692570257125722573257425752576257725782579258025812582258325842585258625872588258925902591259225932594259525962597259825992600260126022603260426052606260726082609261026112612261326142615261626172618261926202621262226232624262526262627262826292630263126322633263426352636263726382639264026412642264326442645264626472648264926502651265226532654265526562657265826592660266126622663266426652666266726682669267026712672267326742675267626772678267926802681268226832684268526862687268826892690269126922693269426952696269726982699270027012702270327042705270627072708270927102711271227132714271527162717271827192720272127222723272427252726272727282729273027312732273327342735273627372738273927402741274227432744274527462747274827492750275127522753275427552756275727582759276027612762276327642765276627672768276927702771277227732774277527762
77727782779278027812782278327842785278627872788278927902791279227932794279527962797279827992800280128022803280428052806280728082809281028112812281328142815281628172818281928202821282228232824282528262827282828292830283128322833283428352836283728382839284028412842284328442845284628472848284928502851285228532854285528562857285828592860286128622863286428652866286728682869287028712872287328742875287628772878287928802881288228832884288528862887288828892890289128922893289428952896289728982899290029012902290329042905290629072908290929102911291229132914291529162917291829192920292129222923292429252926292729282929293029312932293329342935293629372938293929402941294229432944294529462947294829492950295129522953295429552956295729582959296029612962296329642965296629672968296929702971297229732974297529762977297829792980298129822983298429852986298729882989299029912992299329942995299629972998299930003001300230033004300530063007300830093010301130123013301430153016301730183019302030213022302330243025302630273028302930303031303230333034303530363037303830393040304130423043304430453046304730483049305030513052305330543055305630573058305930603061306230633064306530663067306830693070307130723073307430753076307730783079308030813082308330843085308630873088308930903091309230933094309530963097309830993100310131023103310431053106310731083109311031113112311331143115311631173118311931203121312231233124312531263127312831293130313131323133313431353136313731383139314031413142314331443145314631473148314931503151315231533154315531563157315831593160316131623163316431653166316731683169317031713172317331743175317631773178317931803181318231833184318531863187318831893190319131923193319431953196319731983199320032013202320332043205320632073208320932103211321232133214321532163217321832193220322132223223322432253226322732283229323032313232323332343235323632373238323932403241324232433244324532463247324832493250325132523253325432553256325732583259326032613262326332643265326632673268326932703271327232733274327532763
2773278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425 |
- """
- This file contains a minimal set of tests for compliance with the extension
- array interface test suite, and should contain no other tests.
- The test suite for the full functionality of the array is located in
- `pandas/tests/arrays/`.
- The tests in this file are inherited from the BaseExtensionTests, and only
- minimal tweaks should be applied to get the tests passing (by overwriting a
- parent method).
- Additional tests should either be added to one of the BaseExtensionTests
- classes (if they are relevant for the extension interface for all dtypes), or
- be added to the array-specific tests in `pandas/tests/arrays/`.
- """
- from __future__ import annotations
- from datetime import (
- date,
- datetime,
- time,
- timedelta,
- )
- from decimal import Decimal
- from io import (
- BytesIO,
- StringIO,
- )
- import operator
- import pickle
- import re
- import numpy as np
- import pytest
- from pandas._libs import lib
- from pandas._libs.tslibs import timezones
- from pandas.compat import (
- PY311,
- PY312,
- is_ci_environment,
- is_platform_windows,
- pa_version_under11p0,
- pa_version_under13p0,
- pa_version_under14p0,
- pa_version_under20p0,
- pa_version_under21p0,
- )
- from pandas.core.dtypes.dtypes import (
- ArrowDtype,
- CategoricalDtypeType,
- )
- import pandas as pd
- import pandas._testing as tm
- from pandas.api.extensions import no_default
- from pandas.api.types import (
- is_bool_dtype,
- is_float_dtype,
- is_integer_dtype,
- is_numeric_dtype,
- is_signed_integer_dtype,
- is_string_dtype,
- is_unsigned_integer_dtype,
- )
- from pandas.tests.extension import base
- pa = pytest.importorskip("pyarrow")
- from pandas.core.arrays.arrow.array import ArrowExtensionArray
- from pandas.core.arrays.arrow.extension_types import ArrowPeriodType
def _require_timezone_database(request):
    """Mark the requesting test as an expected failure on Windows CI.

    When running on Windows inside a CI environment, an ``xfail`` marker
    expecting ``pa.ArrowInvalid`` is applied to ``request``. In every other
    environment the request is left untouched.
    """
    on_windows_ci = is_platform_windows() and is_ci_environment()
    if not on_windows_ci:
        return
    request.applymarker(
        pytest.mark.xfail(
            raises=pa.ArrowInvalid,
            reason=(
                "TODO: Set ARROW_TIMEZONE_DATABASE environment variable "
                "on CI to path to the tzdata for pyarrow."
            ),
        )
    )
@pytest.fixture(params=tm.ALL_PYARROW_DTYPES, ids=str)
def dtype(request):
    """Wrap each entry of ``tm.ALL_PYARROW_DTYPES`` in an ``ArrowDtype``."""
    pyarrow_type = request.param
    return ArrowDtype(pyarrow_dtype=pyarrow_type)
@pytest.fixture
def data(dtype):
    """Build a length-100 array of ``dtype`` with missing values at 8 and 97.

    Every supported pyarrow type family uses the same layout: a pair of
    values repeated 4 times, a missing value, a second pair repeated 44
    times, another missing value, and a final pair.

    Raises
    ------
    NotImplementedError
        If no sample values are defined for the pyarrow type.
    """

    def _layout(head, body, tail):
        # 4 * head + NA + 44 * body + NA + tail -> 100 elements
        return head * 4 + [None] + body * 44 + [None] + tail

    pa_type = dtype.pyarrow_dtype
    if pa.types.is_boolean(pa_type):
        values = _layout([True, False], [True, False], [True, False])
    elif pa.types.is_floating(pa_type):
        values = _layout([1.0, 0.0], [-2.0, -1.0], [0.5, 99.5])
    elif pa.types.is_signed_integer(pa_type):
        values = _layout([1, 0], [-2, -1], [1, 99])
    elif pa.types.is_unsigned_integer(pa_type):
        values = _layout([1, 0], [2, 1], [1, 99])
    elif pa.types.is_decimal(pa_type):
        values = _layout(
            [Decimal("1"), Decimal("0.0")],
            [Decimal("-2.0"), Decimal("-1.0")],
            [Decimal("0.5"), Decimal("33.123")],
        )
    elif pa.types.is_date(pa_type):
        values = _layout(
            [date(2022, 1, 1), date(1999, 12, 31)],
            [date(2022, 1, 1), date(2022, 1, 1)],
            [date(1999, 12, 31), date(1999, 12, 31)],
        )
    elif pa.types.is_timestamp(pa_type):
        values = _layout(
            [datetime(2020, 1, 1, 1, 1, 1, 1), datetime(1999, 1, 1, 1, 1, 1, 1)],
            [datetime(2020, 1, 1, 1), datetime(1999, 1, 1, 1)],
            [datetime(2020, 1, 1), datetime(1999, 1, 1)],
        )
    elif pa.types.is_duration(pa_type):
        values = _layout(
            [timedelta(1), timedelta(1, 1)],
            [timedelta(-1), timedelta(0)],
            [timedelta(-10), timedelta(10)],
        )
    elif pa.types.is_time(pa_type):
        values = _layout(
            [time(12, 0), time(0, 12)],
            [time(0, 0), time(1, 1)],
            [time(0, 5), time(5, 0)],
        )
    elif pa.types.is_string(pa_type):
        values = _layout(["a", "b"], ["1", "2"], ["!", ">"])
    elif pa.types.is_binary(pa_type):
        values = _layout([b"a", b"b"], [b"1", b"2"], [b"!", b">"])
    else:
        raise NotImplementedError
    return pd.array(values, dtype=dtype)
@pytest.fixture
def data_missing(data):
    """Two-element array laid out as [NA, first element of ``data``]."""
    array_cls = type(data)
    pair = [None, data[0]]
    return array_cls._from_sequence(pair, dtype=data.dtype)
@pytest.fixture(params=["data", "data_missing"])
def all_data(request, data, data_missing):
    """Parametrized fixture yielding the 'data' or 'data_missing' array.

    Lets a test run once with the full sample array and once with the
    short array that contains a missing value.
    """
    # An unrecognised parameter falls through to None.
    by_name = {"data": data, "data_missing": data_missing}
    return by_name.get(request.param)
@pytest.fixture
def data_for_grouping(dtype):
    """
    Data for factorization, grouping, and unique tests.

    The result is laid out as [B, B, NA, NA, A, A, B, C], where the three
    scalars are chosen per pyarrow type family and NA is missing. For the
    boolean family only two distinct values exist, so B and C are both True.

    Raises
    ------
    NotImplementedError
        If no sample scalars are defined for the pyarrow type.
    """
    pa_type = dtype.pyarrow_dtype
    if pa.types.is_boolean(pa_type):
        low, mid, high = False, True, True
    elif pa.types.is_floating(pa_type):
        low, mid, high = -1.1, 0.0, 1.1
    elif pa.types.is_signed_integer(pa_type):
        low, mid, high = -1, 0, 1
    elif pa.types.is_unsigned_integer(pa_type):
        low, mid, high = 0, 1, 10
    elif pa.types.is_date(pa_type):
        low, mid, high = date(1999, 12, 31), date(2010, 1, 1), date(2022, 1, 1)
    elif pa.types.is_timestamp(pa_type):
        low = datetime(1999, 1, 1, 1, 1, 1, 1)
        mid = datetime(2020, 1, 1)
        high = datetime(2020, 1, 1, 1)
    elif pa.types.is_duration(pa_type):
        low, mid, high = timedelta(-1), timedelta(0), timedelta(1, 4)
    elif pa.types.is_time(pa_type):
        low, mid, high = time(0, 0), time(0, 12), time(12, 12)
    elif pa.types.is_string(pa_type):
        low, mid, high = "a", "b", "c"
    elif pa.types.is_binary(pa_type):
        low, mid, high = b"a", b"b", b"c"
    elif pa.types.is_decimal(pa_type):
        low, mid, high = Decimal("-1.1"), Decimal("0.0"), Decimal("1.1")
    else:
        raise NotImplementedError
    layout = [mid, mid, None, None, low, low, mid, high]
    return pd.array(layout, dtype=dtype)
@pytest.fixture
def data_for_sorting(data_for_grouping):
    """
    Length-3 array with a known sort order.

    Built from positions 0, 7 and 4 of ``data_for_grouping``, i.e. the
    items [B, C, A] of that fixture's documented layout.
    """
    array_cls = type(data_for_grouping)
    picked = [data_for_grouping[pos] for pos in (0, 7, 4)]
    return array_cls._from_sequence(picked, dtype=data_for_grouping.dtype)
@pytest.fixture
def data_missing_for_sorting(data_for_grouping):
    """
    Length-3 array with a known sort order and one missing value.

    Built from positions 0, 2 and 4 of ``data_for_grouping``, i.e. the
    items [B, NA, A] of that fixture's documented layout.
    """
    array_cls = type(data_for_grouping)
    picked = [data_for_grouping[pos] for pos in (0, 2, 4)]
    return array_cls._from_sequence(picked, dtype=data_for_grouping.dtype)
@pytest.fixture
def data_for_twos(data):
    """Length-100 array of twos for integer/float/decimal/duration dtypes.

    For every other pyarrow type the ``data`` fixture is returned unchanged.
    """
    pa_dtype = data.dtype.pyarrow_dtype
    accepts_two = any(
        is_kind(pa_dtype)
        for is_kind in (
            pa.types.is_integer,
            pa.types.is_floating,
            pa.types.is_decimal,
            pa.types.is_duration,
        )
    )
    if not accepts_two:
        # tests will be xfailed where 2 is not a valid scalar for pa_dtype
        # TODO: skip otherwise?
        return data
    return pd.array([2] * 100, dtype=data.dtype)
- class TestArrowArray(base.ExtensionTests):
def test_compare_scalar(self, data, comparison_op):
    """Compare a Series of ``data`` against the array's own first element."""
    series = pd.Series(data)
    scalar = data[0]
    self._compare_other(series, data, comparison_op, scalar)
@pytest.mark.parametrize("na_action", [None, "ignore"])
def test_map(self, data_missing, na_action):
    """Mapping the identity function must match the ``to_numpy`` conversion."""
    is_datetimelike = data_missing.dtype.kind in "mM"
    mapped = data_missing.map(lambda x: x, na_action=na_action)
    if is_datetimelike:
        expected = data_missing.to_numpy(dtype=object)
    elif data_missing.dtype == "float32[pyarrow]":
        # map roundtrips through objects, which converts to float64
        expected = data_missing.to_numpy(dtype="float64", na_value=np.nan)
    else:
        expected = data_missing.to_numpy()
    tm.assert_numpy_array_equal(mapped, expected)
def test_astype_str(self, data, request, using_infer_string):
    """Run the base ``astype(str)`` test, xfailing the known mismatches."""
    pa_dtype = data.dtype.pyarrow_dtype
    xfail_reason = None
    if pa.types.is_binary(pa_dtype):
        xfail_reason = f"For {pa_dtype} .astype(str) decodes."
    elif not using_infer_string and (
        (pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None)
        or pa.types.is_duration(pa_dtype)
    ):
        xfail_reason = "pd.Timestamp/pd.Timedelta repr different from numpy repr"

    if xfail_reason is not None:
        request.applymarker(pytest.mark.xfail(reason=xfail_reason))
    super().test_astype_str(data)
def test_from_dtype(self, data, request):
    """Run the base ``test_from_dtype``, xfailing string and decimal types."""
    pa_dtype = data.dtype.pyarrow_dtype
    if pa.types.is_string(pa_dtype):
        xfail_reason = "ArrowDtype(pa.string()) != StringDtype('pyarrow')"
    elif pa.types.is_decimal(pa_dtype):
        xfail_reason = f"pyarrow.type_for_alias cannot infer {pa_dtype}"
    else:
        xfail_reason = None

    if xfail_reason is not None:
        request.applymarker(pytest.mark.xfail(reason=xfail_reason))
    super().test_from_dtype(data)
def test_from_sequence_pa_array(self, data):
    """``_from_sequence`` accepts pyarrow input and stores a ChunkedArray."""
    # https://github.com/pandas-dev/pandas/pull/47034#discussion_r955500784
    array_cls = type(data)

    def _roundtrip(pa_input):
        rebuilt = array_cls._from_sequence(pa_input, dtype=data.dtype)
        tm.assert_extension_array_equal(rebuilt, data)
        assert isinstance(rebuilt._pa_array, pa.ChunkedArray)

    # data._pa_array = pa.ChunkedArray
    _roundtrip(data._pa_array)
    # same values after combine_chunks()
    _roundtrip(data._pa_array.combine_chunks())
def test_from_sequence_pa_array_notimplemented(self, request):
    """Parsing strings into a month/day/nano interval is not implemented."""
    expected_msg = "Converting strings to"
    with pytest.raises(NotImplementedError, match=expected_msg):
        interval_type = pa.month_day_nano_interval()
        ArrowExtensionArray._from_sequence_of_strings(["12-1"], dtype=interval_type)
def test_from_sequence_of_strings_pa_array(self, data, request):
    """Casting to strings and parsing back must reproduce ``data``.

    Checked for both the chunked string array and its combined form.
    """
    pa_dtype = data.dtype.pyarrow_dtype
    if pa.types.is_time64(pa_dtype) and pa_dtype.equals("time64[ns]") and not PY311:
        request.applymarker(
            pytest.mark.xfail(reason="Nanosecond time parsing not supported.")
        )
    elif pa_version_under11p0 and (
        pa.types.is_duration(pa_dtype) or pa.types.is_decimal(pa_dtype)
    ):
        request.applymarker(
            pytest.mark.xfail(
                raises=pa.ArrowNotImplementedError,
                reason=f"pyarrow doesn't support parsing {pa_dtype}",
            )
        )
    elif pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is not None:
        _require_timezone_database(request)

    def _parse_and_compare(strings):
        parsed = type(data)._from_sequence_of_strings(strings, dtype=data.dtype)
        tm.assert_extension_array_equal(parsed, data)

    as_strings = data._pa_array.cast(pa.string())
    _parse_and_compare(as_strings)
    _parse_and_compare(as_strings.combine_chunks())
def check_accumulate(self, ser, op_name, skipna):
    """Compare a cumulative op on ``ser`` with the same op on a Float64 copy.

    Temporal inputs (and their results) are first cast to the pyarrow
    integer type of matching bit width.
    """
    result = getattr(ser, op_name)(skipna=skipna)

    pa_type = ser.dtype.pyarrow_dtype
    if pa.types.is_temporal(pa_type):
        # Just check that we match the integer behavior.
        int_type = "int32[pyarrow]" if pa_type.bit_width == 32 else "int64[pyarrow]"
        ser = ser.astype(int_type)
        result = result.astype(int_type)

    result = result.astype("Float64")
    reference = ser.astype("Float64")
    expected = getattr(reference, op_name)(skipna=skipna)
    tm.assert_series_equal(result, expected, check_dtype=False)
def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool:
    """Return whether ``op_name`` is treated as supported for ``ser``'s dtype.

    Each pyarrow type family has a list of cumulative ops reported as
    unsupported; any op outside that list is reported as supported.
    """
    # error: Item "dtype[Any]" of "dtype[Any] | ExtensionDtype" has no
    # attribute "pyarrow_dtype"
    pa_type = ser.dtype.pyarrow_dtype  # type: ignore[union-attr]

    if pa.types.is_binary(pa_type) or pa.types.is_decimal(pa_type):
        unsupported = ["cumsum", "cumprod", "cummax", "cummin"]
    elif pa.types.is_string(pa_type):
        unsupported = ["cumprod"]
    elif pa.types.is_boolean(pa_type):
        unsupported = ["cumprod", "cummax", "cummin"]
    elif pa.types.is_temporal(pa_type):
        # cumsum is only allowed for durations
        if pa.types.is_duration(pa_type):
            unsupported = ["cumprod"]
        else:
            unsupported = ["cumsum", "cumprod"]
    else:
        unsupported = []

    return op_name not in unsupported
@pytest.mark.parametrize("skipna", [True, False])
def test_accumulate_series(self, data, all_numeric_accumulations, skipna, request):
    """Check cumulative operations on a Series backed by ``data``.

    Parameters
    ----------
    data : ArrowExtensionArray
        Sample array for the dtype under test.
    all_numeric_accumulations : str
        Name of the cumulative Series method to call.
    skipna : bool
        Forwarded to the cumulative method.
    request : pytest.FixtureRequest
        Used to apply xfail markers and to read the ``-m`` expression.

    Notes
    -----
    Dtype/op combinations reported as unsupported by
    ``_supports_accumulation`` are delegated to the base-class test.
    """
    pa_type = data.dtype.pyarrow_dtype
    op_name = all_numeric_accumulations

    if pa.types.is_string(pa_type) and op_name in ["cumsum", "cummin", "cummax"]:
        # https://github.com/pandas-dev/pandas/pull/60633
        # Doesn't fit test structure, tested in series/test_cumulative.py instead.
        return

    ser = pd.Series(data)

    if not self._supports_accumulation(ser, op_name):
        # The base class test will check that we raise
        return super().test_accumulate_series(
            data, all_numeric_accumulations, skipna
        )

    if pa_version_under13p0 and op_name != "cumsum":
        # The guard is ``pa_version_under13p0``, so the message names that
        # version (it previously said "pyarrow < 9").
        unsupported_msg = f"{op_name} not implemented for pyarrow < 13"

        # xfailing takes a long time to run because pytest
        # renders the exception messages even when not showing them
        opt = request.config.option
        if opt.markexpr and "not slow" in opt.markexpr:
            pytest.skip(unsupported_msg)
        request.applymarker(pytest.mark.xfail(reason=unsupported_msg))
    elif op_name == "cumsum" and (
        pa.types.is_boolean(pa_type) or pa.types.is_decimal(pa_type)
    ):
        request.applymarker(
            pytest.mark.xfail(
                reason=f"{op_name} not implemented for {pa_type}",
                raises=TypeError,
            )
        )

    self.check_accumulate(ser, op_name, skipna)
def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool:
    """Return whether reduction ``op_name`` is treated as supported for ``ser``.

    "kurt" is always rejected, and "skew" is rejected when
    ``pa_version_under20p0`` is true. The remaining rules depend on the
    pyarrow type family of the Series dtype.
    """
    if op_name == "kurt" or (pa_version_under20p0 and op_name == "skew"):
        return False

    # error: Item "dtype[Any]" of "dtype[Any] | ExtensionDtype" has
    # no attribute "pyarrow_dtype"
    pa_dtype = ser.dtype.pyarrow_dtype  # type: ignore[union-attr]
    temporal = pa.types.is_temporal(pa_dtype)
    duration = pa.types.is_duration(pa_dtype)
    binary = pa.types.is_binary(pa_dtype)
    string = pa.types.is_string(pa_dtype)

    if temporal and op_name in ["sum", "var", "skew", "kurt", "prod"]:
        # summing timedeltas is one case that *is* well-defined
        if not (duration and op_name in ["sum"]):
            return False
    elif binary and op_name in ["sum", "skew"]:
        return False
    elif (string or binary) and op_name in [
        "mean",
        "median",
        "prod",
        "std",
        "sem",
        "var",
        "skew",
        "kurt",
    ]:
        return False

    # xref GH#34479 we support this in our non-pyarrow datetime64 dtypes,
    # but it isn't obvious we _should_. For now, we keep the pyarrow
    # behavior which does not support this.
    rejects_any_all = temporal and not duration and op_name in ["any", "all"]
    return not rejects_any_all
def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool):
    """Compare a reduction on ``ser`` with the same reduction on a reference.

    Integer and floating Series are compared against a ``Float64`` copy;
    every other dtype is compared against itself. ``count`` is called
    without ``skipna``; all other ops receive it.
    """
    # error: Item "dtype[Any]" of "dtype[Any] | ExtensionDtype" has no
    # attribute "pyarrow_dtype"
    pa_dtype = ser.dtype.pyarrow_dtype  # type: ignore[union-attr]
    is_numeric = pa.types.is_integer(pa_dtype) or pa.types.is_floating(pa_dtype)

    # TODO: in the opposite (non-numeric) case, aren't we testing... nothing?
    # For e.g. date/time dtypes trying to calculate 'expected' by converting
    # to object will raise for mean, std etc
    reference = ser.astype("Float64") if is_numeric else ser

    call_kwargs = {} if op_name == "count" else {"skipna": skipna}
    result = getattr(ser, op_name)(**call_kwargs)
    expected = getattr(reference, op_name)(**call_kwargs)
    tm.assert_almost_equal(result, expected)
@pytest.mark.parametrize("skipna", [True, False])
def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna, request):
    """Run the base numeric-reduction test with pyarrow-specific xfails."""
    pa_dtype = data.dtype.pyarrow_dtype
    is_bool = pa.types.is_boolean(pa_dtype)
    not_implemented = pytest.mark.xfail(
        raises=TypeError,
        reason=(
            f"{all_numeric_reductions} is not implemented in "
            f"pyarrow={pa.__version__} for {pa_dtype}"
        ),
    )

    if is_bool and all_numeric_reductions in {"sem", "std", "var", "median"}:
        request.applymarker(not_implemented)
    elif not pa_version_under20p0 and all_numeric_reductions == "skew":
        int_or_float = pa.types.is_integer(pa_dtype) or pa.types.is_floating(
            pa_dtype
        )
        if is_bool or (skipna and int_or_float):
            request.applymarker(
                pytest.mark.xfail(
                    reason="https://github.com/apache/arrow/issues/45733",
                )
            )

    super().test_reduce_series_numeric(data, all_numeric_reductions, skipna)
@pytest.mark.parametrize("skipna", [True, False])
def test_reduce_series_boolean(
    self, data, all_boolean_reductions, skipna, na_value, request
):
    """Run the base boolean-reduction test, xfailing string/binary dtypes."""
    pa_dtype = data.dtype.pyarrow_dtype
    not_implemented = pytest.mark.xfail(
        raises=TypeError,
        reason=(
            f"{all_boolean_reductions} is not implemented in "
            f"pyarrow={pa.__version__} for {pa_dtype}"
        ),
    )
    is_text_like = pa.types.is_string(pa_dtype) or pa.types.is_binary(pa_dtype)
    if is_text_like:
        # We *might* want to make this behave like the non-pyarrow cases,
        # but have not yet decided.
        request.applymarker(not_implemented)

    return super().test_reduce_series_boolean(data, all_boolean_reductions, skipna)
def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool):
    """Return the dtype a reduction ``op_name`` over ``arr`` is compared with.

    Raises
    ------
    KeyError
        If no earlier rule matches and ``arr.dtype.kind`` is not one of
        "i", "u" or "f".
    """
    pa_type = arr._pa_array.type

    if op_name in ["max", "min"]:
        return arr.dtype

    if arr.dtype.name == "decimal128(7, 3)[pyarrow]":
        if op_name == "sum" and not pa_version_under21p0:
            # https://github.com/apache/arrow/pull/44184
            return ArrowDtype(pa.decimal128(38, 3))
        if op_name not in ["median", "var", "std", "skew"]:
            return arr.dtype
        return "float64[pyarrow]"

    if op_name in ["median", "var", "std", "mean", "skew"]:
        return "float64[pyarrow]"

    if op_name == "sum" and pa.types.is_string(pa_type):
        return arr.dtype

    widest_by_kind = {
        "i": "int64[pyarrow]",
        "u": "uint64[pyarrow]",
        "f": "float64[pyarrow]",
    }
    return widest_by_kind[arr.dtype.kind]
@pytest.mark.parametrize("skipna", [True, False])
def test_reduce_frame(self, data, all_numeric_reductions, skipna, request):
    """Run the base frame-reduction test, xfailing unsupported numeric skew."""
    skew_unavailable = all_numeric_reductions == "skew" and pa_version_under20p0
    if skew_unavailable and data.dtype._is_numeric:
        request.applymarker(pytest.mark.xfail(reason="skew not implemented"))
    return super().test_reduce_frame(data, all_numeric_reductions, skipna)
@pytest.mark.parametrize("typ", ["int64", "uint64", "float64"])
def test_median_not_approximate(self, typ):
    """The median of [1, 2] must be exactly 1.5 (GH 52679)."""
    ser = pd.Series([1, 2], dtype=f"{typ}[pyarrow]")
    median = ser.median()
    assert median == 1.5
def test_construct_from_string_own_name(self, dtype, request):
    """Check ``construct_from_string(dtype.name)`` for each arrow dtype.

    Decimal types are xfailed, and string types are expected to raise a
    ``TypeError``; everything else runs the base-class test.
    """
    pa_dtype = dtype.pyarrow_dtype

    if pa.types.is_decimal(pa_dtype):
        decimal_xfail = pytest.mark.xfail(
            raises=NotImplementedError,
            reason=f"pyarrow.type_for_alias cannot infer {pa_dtype}",
        )
        request.applymarker(decimal_xfail)

    if not pa.types.is_string(pa_dtype):
        super().test_construct_from_string_own_name(dtype)
        return

    # We still support StringDtype('pyarrow') over ArrowDtype(pa.string())
    expected_msg = r"string\[pyarrow\] should be constructed by StringDtype"
    with pytest.raises(TypeError, match=expected_msg):
        dtype.construct_from_string(dtype.name)
def test_is_dtype_from_name(self, dtype, request):
    """Check ``is_dtype(dtype.name)``; string dtypes must not be recognised."""
    pa_dtype = dtype.pyarrow_dtype

    if pa.types.is_string(pa_dtype):
        # We still support StringDtype('pyarrow') over ArrowDtype(pa.string())
        assert not type(dtype).is_dtype(dtype.name)
        return

    if pa.types.is_decimal(pa_dtype):
        decimal_xfail = pytest.mark.xfail(
            raises=NotImplementedError,
            reason=f"pyarrow.type_for_alias cannot infer {pa_dtype}",
        )
        request.applymarker(decimal_xfail)
    super().test_is_dtype_from_name(dtype)
def test_construct_from_string_another_type_raises(self, dtype):
    """A name without the ``[pyarrow]`` suffix must raise ``TypeError``."""
    dtype_cls = type(dtype)
    expected_msg = r"'another_type' must end with '\[pyarrow\]'"
    with pytest.raises(TypeError, match=expected_msg):
        dtype_cls.construct_from_string("another_type")
def test_get_common_dtype(self, dtype, request):
    """Run the base common-dtype test, xfailing types without a numpy dtype.

    Date, time, tz-aware timestamp, binary and decimal types are marked as
    expected failures.
    """
    pa_dtype = dtype.pyarrow_dtype

    tz_aware_timestamp = (
        pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is not None
    )
    lacks_numpy_dtype = (
        pa.types.is_date(pa_dtype)
        or pa.types.is_time(pa_dtype)
        or tz_aware_timestamp
        or pa.types.is_binary(pa_dtype)
        or pa.types.is_decimal(pa_dtype)
    )
    if lacks_numpy_dtype:
        reason = (
            f"{pa_dtype} does not have associated numpy "
            f"dtype findable by find_common_type"
        )
        request.applymarker(pytest.mark.xfail(reason=reason))
    super().test_get_common_dtype(dtype)
def test_is_not_string_type(self, dtype):
    """Arrow string dtypes count as string dtypes; all others use the base test."""
    if not pa.types.is_string(dtype.pyarrow_dtype):
        super().test_is_not_string_type(dtype)
        return
    assert is_string_dtype(dtype)
@pytest.mark.xfail(
    run=False,
    reason="GH 45419: pyarrow.ChunkedArray does not support views.",
)
def test_view(self, data):
    """Base ``view`` test; marked xfail and never executed (``run=False``)."""
    super().test_view(data)
def test_fillna_no_op_returns_copy(self, data):
    """``fillna`` on data without missing values returns an equal new object."""
    data = data[~data.isna()]
    fill_value = data[0]

    filled_by_value = data.fillna(fill_value)
    assert filled_by_value is not data
    tm.assert_extension_array_equal(filled_by_value, data)

    filled_by_method = data.fillna(method="backfill")
    assert filled_by_method is not data
    tm.assert_extension_array_equal(filled_by_method, data)
@pytest.mark.xfail(
    run=False,
    reason="GH 45419: pyarrow.ChunkedArray does not support views",
)
def test_transpose(self, data):
    """Base ``transpose`` test; marked xfail and never executed (``run=False``)."""
    super().test_transpose(data)
@pytest.mark.xfail(
    run=False,
    reason="GH 45419: pyarrow.ChunkedArray does not support views",
)
def test_setitem_preserves_views(self, data):
    """Base view-preservation test; marked xfail and never executed."""
    super().test_setitem_preserves_views(data)
@pytest.mark.parametrize("dtype_backend", ["pyarrow", no_default])
@pytest.mark.parametrize("engine", ["c", "python"])
def test_EA_types(self, engine, data, dtype_backend, request):
    """Round-trip ``data`` through CSV and read it back with an explicit dtype.

    Decimal, ``us``/``ns`` timestamp and binary types are marked as expected
    failures before the round trip is attempted.
    """
    pa_dtype = data.dtype.pyarrow_dtype
    binary_dtype = pa.types.is_binary(pa_dtype)

    mark = None
    if pa.types.is_decimal(pa_dtype):
        mark = pytest.mark.xfail(
            raises=NotImplementedError,
            reason=f"Parameterized types {pa_dtype} not supported.",
        )
    elif pa.types.is_timestamp(pa_dtype) and pa_dtype.unit in ("us", "ns"):
        mark = pytest.mark.xfail(
            raises=ValueError,
            reason="https://github.com/pandas-dev/pandas/issues/49767",
        )
    elif binary_dtype:
        mark = pytest.mark.xfail(
            reason="CSV parsers don't correctly handle binary"
        )
    if mark is not None:
        request.applymarker(mark)

    dtype_name = str(data.dtype)
    df = pd.DataFrame({"with_dtype": pd.Series(data, dtype=dtype_name)})
    csv_text = df.to_csv(index=False, na_rep=np.nan)

    # Binary data is fed to the parser through a bytes buffer.
    buffer_cls = BytesIO if binary_dtype else StringIO
    result = pd.read_csv(
        buffer_cls(csv_text),
        dtype={"with_dtype": dtype_name},
        engine=engine,
        dtype_backend=dtype_backend,
    )
    tm.assert_frame_equal(result, df)
def test_invert(self, data, request):
    """Run the base ``~`` test, xfailing dtypes that cannot be inverted.

    Anything other than boolean, integer or string is marked as an expected
    ``pa.ArrowNotImplementedError``. When ``PY312`` is set, boolean data is
    additionally expected to emit a "Bitwise inversion" DeprecationWarning.
    """
    pa_dtype = data.dtype.pyarrow_dtype

    invert_supported = (
        pa.types.is_boolean(pa_dtype)
        or pa.types.is_integer(pa_dtype)
        or pa.types.is_string(pa_dtype)
    )
    if not invert_supported:
        # The marker fires for *unsupported* types, so the reason must say
        # "does not support" (the original text dropped the "not").
        request.applymarker(
            pytest.mark.xfail(
                raises=pa.ArrowNotImplementedError,
                reason=f"pyarrow.compute.invert does not support {pa_dtype}",
            )
        )

    if PY312 and pa.types.is_boolean(pa_dtype):
        with tm.assert_produces_warning(
            DeprecationWarning, match="Bitwise inversion", check_stacklevel=False
        ):
            super().test_invert(data)
    else:
        super().test_invert(data)
@pytest.mark.parametrize("periods", [1, -2])
def test_diff(self, data, periods, request):
    """Run the base ``diff`` test; unsigned ints with ``periods=1`` are xfailed."""
    pa_dtype = data.dtype.pyarrow_dtype
    expect_overflow = pa.types.is_unsigned_integer(pa_dtype) and periods == 1
    if expect_overflow:
        mark = pytest.mark.xfail(
            raises=pa.ArrowInvalid,
            reason=f"diff with {pa_dtype} and periods={periods} will overflow",
        )
        request.applymarker(mark)
    super().test_diff(data, periods)
def test_value_counts_returns_pyarrow_int64(self, data):
    """GH 51462: ``value_counts`` yields a pyarrow-backed int64 result."""
    counts = data[:10].value_counts()
    assert counts.dtype == ArrowDtype(pa.int64())
- _combine_le_expected_dtype = "bool[pyarrow]"
def get_op_from_name(self, op_name):
    """Return the callable implementing ``op_name``.

    ``rtruediv`` and ``rfloordiv`` are served by numpy functions with swapped
    operands; every other name is resolved by ``tm.get_op_from_name``.
    """
    short_opname = op_name.strip("_")

    if short_opname == "rfloordiv":
        return lambda x, y: np.floor_divide(y, x)

    if short_opname != "rtruediv":
        return tm.get_op_from_name(op_name)

    # use the numpy version that won't raise on division by zero
    def rtruediv(x, y):
        return np.divide(y, x)

    return rtruediv
def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result):
    """Cast the pointwise-computed expected result to the dtype the op yields.

    BaseOpsUtil._combine can upcast the expected dtype (because it generates
    expected on python scalars) while ArrowExtensionArray maintains the
    original type, so the expected value is adjusted here.
    """
    expected = pointwise_result

    if op_name in ["eq", "ne", "lt", "le", "gt", "ge"]:
        return pointwise_result.astype("boolean[pyarrow]")

    was_frame = isinstance(expected, pd.DataFrame)
    if was_frame:
        # For frames only the first column is inspected.
        expected_data = expected.iloc[:, 0]
        original_dtype = obj.iloc[:, 0].dtype
    else:
        expected_data = expected
        original_dtype = obj.dtype

    orig_pa_type = original_dtype.pyarrow_dtype
    is_int = pa.types.is_integer(orig_pa_type)
    temporal_or_decimal = (
        pa.types.is_duration(orig_pa_type)
        or pa.types.is_timestamp(orig_pa_type)
        or pa.types.is_date(orig_pa_type)
        or pa.types.is_decimal(orig_pa_type)
    )
    if not was_frame and isinstance(other, pd.Series):
        # i.e. test_arith_series_with_array
        needs_cast = (
            pa.types.is_floating(orig_pa_type)
            or (is_int and op_name not in ["__truediv__", "__rtruediv__"])
            or temporal_or_decimal
        )
    else:
        needs_cast = (op_name == "__floordiv__" and is_int) or temporal_or_decimal

    if not needs_cast:
        # base class _combine always returns int64, while
        # ArrowExtensionArray does not upcast
        return expected

    pa_expected = pa.array(expected_data._values)

    if pa.types.is_duration(pa_expected.type):
        if pa.types.is_date(orig_pa_type):
            # TODO: why is date64 different vs date32?
            unit = "ms" if pa.types.is_date64(orig_pa_type) else "s"
        else:
            # pyarrow sees sequence of datetime/timedelta objects and defaults
            # to "us" but the non-pointwise op retains unit
            # timestamp or duration
            unit = orig_pa_type.unit
            if type(other) in [datetime, timedelta] and unit in ["s", "ms"]:
                # pydatetime/pytimedelta objects have microsecond reso, so we
                # take the higher reso of the original and microsecond. Note
                # this matches what we would do with DatetimeArray/TimedeltaArray
                unit = "us"
        pa_expected = pa_expected.cast(f"duration[{unit}]")

    elif pa.types.is_decimal(pa_expected.type) and pa.types.is_decimal(
        orig_pa_type
    ):
        # decimal precision can resize in the result type depending on data
        # just compare the float values
        alt = getattr(obj, op_name)(other)
        alt_dtype = tm.get_dtype(alt)
        assert isinstance(alt_dtype, ArrowDtype)

        pow_yields_float = op_name == "__pow__" and (
            isinstance(other, Decimal)
            or (isinstance(other, pd.Series) and other.dtype == original_dtype)
        )
        if pow_yields_float:
            # TODO: would it make more sense to retain Decimal here?
            alt_dtype = ArrowDtype(pa.float64())
        else:
            assert pa.types.is_decimal(alt_dtype.pyarrow_dtype)
        return expected.astype(alt_dtype)

    else:
        pa_expected = pa_expected.cast(orig_pa_type)

    pd_expected = type(expected_data._values)(pa_expected)
    if was_frame:
        return pd.DataFrame(
            pd_expected, index=expected.index, columns=expected.columns
        )
    return pd.Series(pd_expected)
def _is_temporal_supported(self, opname, pa_dtype):
    """Return whether ``opname`` is treated as supported for a temporal dtype.

    True for add (and, unless ``pa_version_under14p0``, true/floor division)
    on durations, and for subtraction on any temporal type.
    """
    division_ops = ("__truediv__", "__rtruediv__", "__floordiv__", "__rfloordiv__")
    duration_op = opname in ("__add__", "__radd__") or (
        opname in division_ops and not pa_version_under14p0
    )
    if duration_op and pa.types.is_duration(pa_dtype):
        return True
    return opname in ("__sub__", "__rsub__") and pa.types.is_temporal(pa_dtype)
def _get_expected_exception(
    self, op_name: str, obj, other
) -> type[Exception] | tuple[type[Exception], ...] | None:
    """Return the exception(s) ``op_name`` should raise on ``obj``, or None.

    divmod/mod variants expect ``(NotImplementedError, TypeError)``; other
    ops expect ``TypeError`` unless the dtype is temporal-supported, a
    string/binary being added, or float/integer/decimal.
    """
    if op_name in ("__divmod__", "__rdivmod__"):
        return (NotImplementedError, TypeError)

    dtype = tm.get_dtype(obj)
    # error: Item "dtype[Any]" of "dtype[Any] | ExtensionDtype" has no
    # attribute "pyarrow_dtype"
    pa_dtype = dtype.pyarrow_dtype  # type: ignore[union-attr]
    arrow_temporal_supported = self._is_temporal_supported(op_name, pa_dtype)

    if op_name in {"__mod__", "__rmod__"}:
        return (NotImplementedError, TypeError)
    if arrow_temporal_supported:
        return None

    concatenable = pa.types.is_string(pa_dtype) or pa.types.is_binary(pa_dtype)
    if op_name in ["__add__", "__radd__"] and concatenable:
        return None

    numeric = (
        pa.types.is_floating(pa_dtype)
        or pa.types.is_integer(pa_dtype)
        or pa.types.is_decimal(pa_dtype)
    )
    return None if numeric else TypeError
def _get_arith_xfail_marker(self, opname, pa_dtype):
    """Return an xfail marker for ``opname`` on ``pa_dtype``, or None.

    The checks are ordered; the first matching rule wins:

    1. ``__rpow__`` on float/integer/decimal (GH#29997).
    2. A temporal-supported op on a time type, or true/floor division on a
       duration (expected ``TypeError``).
    3. ``__rfloordiv__`` on integer/decimal (expected ``pa.ArrowInvalid``).
    4. ``__rtruediv__`` on decimal (expected ``pa.ArrowInvalid``).
    """
    arrow_temporal_supported = self._is_temporal_supported(opname, pa_dtype)

    if opname == "__rpow__" and (
        pa.types.is_floating(pa_dtype)
        or pa.types.is_integer(pa_dtype)
        or pa.types.is_decimal(pa_dtype)
    ):
        return pytest.mark.xfail(
            reason=(
                f"GH#29997: 1**pandas.NA == 1 while 1**pyarrow.NA == NULL "
                f"for {pa_dtype}"
            )
        )

    division_ops = ("__truediv__", "__rtruediv__", "__floordiv__", "__rfloordiv__")
    if arrow_temporal_supported and (
        pa.types.is_time(pa_dtype)
        or (opname in division_ops and pa.types.is_duration(pa_dtype))
    ):
        return pytest.mark.xfail(
            raises=TypeError,
            # Trailing space added: the two fragments previously joined as
            # "betweenpd.NA".
            reason=(
                f"{opname} not supported between "
                f"pd.NA and {pa_dtype} Python scalar"
            ),
        )

    if opname == "__rfloordiv__" and (
        pa.types.is_integer(pa_dtype) or pa.types.is_decimal(pa_dtype)
    ):
        return pytest.mark.xfail(raises=pa.ArrowInvalid, reason="divide by 0")

    if opname == "__rtruediv__" and pa.types.is_decimal(pa_dtype):
        return pytest.mark.xfail(raises=pa.ArrowInvalid, reason="divide by 0")

    return None
def test_arith_series_with_scalar(self, data, all_arithmetic_operators, request):
    """Run the base Series-with-scalar arithmetic test with arrow xfails applied.

    ``__rmod__`` on binary data is skipped outright.
    """
    op_name = all_arithmetic_operators
    pa_dtype = data.dtype.pyarrow_dtype

    if op_name == "__rmod__" and pa.types.is_binary(pa_dtype):
        pytest.skip("Skip testing Python string formatting")

    xfail_mark = self._get_arith_xfail_marker(op_name, pa_dtype)
    if xfail_mark is not None:
        request.applymarker(xfail_mark)

    super().test_arith_series_with_scalar(data, op_name)
def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request):
    """Run the base DataFrame-with-scalar arithmetic test with arrow xfails.

    ``__rmod__`` on string or binary data is skipped outright.
    """
    op_name = all_arithmetic_operators
    pa_dtype = data.dtype.pyarrow_dtype

    text_like = pa.types.is_string(pa_dtype) or pa.types.is_binary(pa_dtype)
    if op_name == "__rmod__" and text_like:
        pytest.skip("Skip testing Python string formatting")

    xfail_mark = self._get_arith_xfail_marker(op_name, pa_dtype)
    if xfail_mark is not None:
        request.applymarker(xfail_mark)

    super().test_arith_frame_with_scalar(data, op_name)
def test_arith_series_with_array(self, data, all_arithmetic_operators, request):
    """Check Series-with-Series arithmetic against an arrow-backed operand.

    Subtraction on unsigned integers is xfailed (checked subtraction raises
    on overflow), then the generic arithmetic xfail marker is applied.
    """
    op_name = all_arithmetic_operators
    pa_dtype = data.dtype.pyarrow_dtype

    is_subtraction = op_name in ("__sub__", "__rsub__")
    if is_subtraction and pa.types.is_unsigned_integer(pa_dtype):
        overflow_mark = pytest.mark.xfail(
            raises=pa.ArrowInvalid,
            reason=(
                f"Implemented pyarrow.compute.subtract_checked "
                f"which raises on overflow for {pa_dtype}"
            ),
        )
        request.applymarker(overflow_mark)

    generic_mark = self._get_arith_xfail_marker(op_name, pa_dtype)
    if generic_mark is not None:
        request.applymarker(generic_mark)

    ser = pd.Series(data)
    # pd.Series([ser.iloc[0]] * len(ser)) may not return ArrowExtensionArray
    # since ser.iloc[0] is a python scalar
    repeated_first = pd.array([ser.iloc[0]] * len(ser), dtype=data.dtype)
    other = pd.Series(repeated_first)

    self.check_opname(ser, op_name, other)
def test_add_series_with_extension_array(self, data, request):
    """Run the base Series + ExtensionArray test; ``int8`` is xfailed for overflow."""
    pa_dtype = data.dtype.pyarrow_dtype
    if pa_dtype.equals("int8"):
        overflow_mark = pytest.mark.xfail(
            raises=pa.ArrowInvalid,
            reason=f"raises on overflow for {pa_dtype}",
        )
        request.applymarker(overflow_mark)
    super().test_add_series_with_extension_array(data)
def test_invalid_other_comp(self, data, comparison_op):
    """GH 48833: comparing against a bare ``object()`` raises NotImplementedError."""
    expected_msg = ".* not implemented for <class 'object'>"
    with pytest.raises(NotImplementedError, match=expected_msg):
        comparison_op(data, object())
@pytest.mark.parametrize("masked_dtype", ["boolean", "Int64", "Float64"])
def test_comp_masked_numpy(self, masked_dtype, comparison_op):
    """GH 52625: compare a pyarrow-backed Series with a masked Series.

    Both hold ``[1, 0, None]``, so the two valid positions compare equal and
    the missing position stays missing.
    """
    data = [1, 0, None]
    ser_masked = pd.Series(data, dtype=masked_dtype)
    ser_pa = pd.Series(data, dtype=f"{masked_dtype.lower()}[pyarrow]")

    result = comparison_op(ser_pa, ser_masked)

    # Equal operands: strict and "not equal" comparisons are False.
    holds_for_equal = comparison_op not in (operator.lt, operator.gt, operator.ne)
    expected = pd.Series(
        [holds_for_equal, holds_for_equal, None], dtype=ArrowDtype(pa.bool_())
    )
    tm.assert_series_equal(result, expected)
class TestLogicalOps:
    """Various Series and DataFrame logical ops methods."""

    @staticmethod
    def _series_operands():
        """Return fresh ``(a, b)`` covering all nine True/False/NA pairings."""
        a = pd.Series(
            [True] * 3 + [False] * 3 + [None] * 3, dtype="boolean[pyarrow]"
        )
        b = pd.Series([True, False, None] * 3, dtype="boolean[pyarrow]")
        return a, b

    @staticmethod
    def _scalar_operand():
        """Return a fresh ``[True, False, NA]`` pyarrow boolean Series."""
        return pd.Series([True, False, None], dtype="boolean[pyarrow]")

    def _check_series_op(self, op, expected_values):
        """Assert ``op`` is symmetric on the operand pair and does not mutate it."""
        a, b = self._series_operands()
        expected = pd.Series(expected_values, dtype="boolean[pyarrow]")

        tm.assert_series_equal(op(a, b), expected)
        tm.assert_series_equal(op(b, a), expected)

        # ensure we haven't mutated anything inplace
        pristine_a, pristine_b = self._series_operands()
        tm.assert_series_equal(a, pristine_a)
        tm.assert_series_equal(b, pristine_b)

    def _check_scalar_op(self, op, other, expected_values):
        """Assert ``op`` with scalar ``other`` on either side, without mutation."""
        a = self._scalar_operand()
        expected = pd.Series(expected_values, dtype="boolean[pyarrow]")

        tm.assert_series_equal(op(a, other), expected)
        tm.assert_series_equal(op(other, a), expected)

        # ensure we haven't mutated anything inplace
        tm.assert_series_equal(a, self._scalar_operand())

    def test_kleene_or(self):
        self._check_series_op(
            operator.or_,
            [True, True, True, True, False, None, True, None, None],
        )

    @pytest.mark.parametrize(
        "other, expected",
        [
            (None, [True, None, None]),
            (pd.NA, [True, None, None]),
            (True, [True, True, True]),
            (np.bool_(True), [True, True, True]),
            (False, [True, False, None]),
            (np.bool_(False), [True, False, None]),
        ],
    )
    def test_kleene_or_scalar(self, other, expected):
        self._check_scalar_op(operator.or_, other, expected)

    def test_kleene_and(self):
        self._check_series_op(
            operator.and_,
            [True, False, None, False, False, False, None, False, None],
        )

    @pytest.mark.parametrize(
        "other, expected",
        [
            (None, [None, False, None]),
            (pd.NA, [None, False, None]),
            (True, [True, False, None]),
            (False, [False, False, False]),
            (np.bool_(True), [True, False, None]),
            (np.bool_(False), [False, False, False]),
        ],
    )
    def test_kleene_and_scalar(self, other, expected):
        self._check_scalar_op(operator.and_, other, expected)

    def test_kleene_xor(self):
        self._check_series_op(
            operator.xor,
            [False, True, None, True, False, None, None, None, None],
        )

    @pytest.mark.parametrize(
        "other, expected",
        [
            (None, [None, None, None]),
            (pd.NA, [None, None, None]),
            (True, [False, True, None]),
            (np.bool_(True), [False, True, None]),
            # Plain ``False`` was missing although the or/and scalar tests
            # cover both the Python and numpy booleans.
            (False, [True, False, None]),
            (np.bool_(False), [True, False, None]),
        ],
    )
    def test_kleene_xor_scalar(self, other, expected):
        self._check_scalar_op(operator.xor, other, expected)

    @pytest.mark.parametrize(
        "op, exp",
        [
            ["__and__", True],
            ["__or__", True],
            ["__xor__", False],
        ],
    )
    def test_logical_masked_numpy(self, op, exp):
        # GH 52625
        data = [True, False, None]
        ser_masked = pd.Series(data, dtype="boolean")
        ser_pa = pd.Series(data, dtype="boolean[pyarrow]")
        result = getattr(ser_pa, op)(ser_masked)
        expected = pd.Series([exp, False, None], dtype=ArrowDtype(pa.bool_()))
        tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("pa_type", tm.ALL_INT_PYARROW_DTYPES)
def test_bitwise(pa_type):
    """GH 54495: ``|``, ``&``, ``^`` and ``~`` on pyarrow integer Series.

    A missing value in either operand yields a missing result.
    """
    dtype = ArrowDtype(pa_type)
    left = pd.Series([1, None, 3, 4], dtype=dtype)
    right = pd.Series([None, 3, 5, 4], dtype=dtype)

    for bit_op in (operator.or_, operator.and_, operator.xor):
        result = bit_op(left, right)
        expected = pd.Series(
            [None, None, bit_op(3, 5), bit_op(4, 4)], dtype=dtype
        )
        tm.assert_series_equal(result, expected)

    result = ~left
    inverted_values = ~(left.fillna(0).to_numpy())
    expected = pd.Series(inverted_values, dtype=dtype).mask(left.isnull())
    tm.assert_series_equal(result, expected)
def test_arrowdtype_construct_from_string_type_with_unsupported_parameters():
    """Parameterised type strings that cannot be parsed raise NotImplementedError."""
    unsupported_names = (
        "not_a_real_dype[s, tz=UTC][pyarrow]",
        "decimal(7, 2)[pyarrow]",
    )
    for dtype_name in unsupported_names:
        with pytest.raises(NotImplementedError, match="Passing pyarrow type"):
            ArrowDtype.construct_from_string(dtype_name)
def test_arrowdtype_construct_from_string_supports_dt64tz():
    """As of GH#50689, tz-aware timestamp strings are supported."""
    parsed = ArrowDtype.construct_from_string("timestamp[s, tz=UTC][pyarrow]")
    assert parsed == ArrowDtype(pa.timestamp("s", "UTC"))
def test_arrowdtype_construct_from_string_type_only_one_pyarrow():
    """GH#51225: a dtype string with two ``[pyarrow]`` markers is rejected."""
    bad_dtype = "int64[pyarrow]foobar[pyarrow]"
    expected_msg = (
        r"Passing pyarrow type specific parameters \(\[pyarrow\]\) in the "
        r"string is not supported\."
    )
    with pytest.raises(NotImplementedError, match=expected_msg):
        pd.Series(range(3), dtype=bad_dtype)
def test_arrow_string_multiplication():
    """GH 56537: string Series times integer Series repeats element-wise.

    A negative repeat count gives an empty string; the reflected product
    must match.
    """
    string_dtype = ArrowDtype(pa.string())
    strings = pd.Series(["abc", "defg"], dtype=string_dtype)
    repeats = pd.Series([2, -2], dtype="int64[pyarrow]")

    result = strings * repeats
    expected = pd.Series(["abcabc", ""], dtype=ArrowDtype(pa.string()))
    tm.assert_series_equal(result, expected)

    reflected_result = repeats * strings
    tm.assert_series_equal(result, reflected_result)
def test_arrow_string_multiplication_scalar_repeat():
    """String Series times an integer scalar repeats every element, either side."""
    strings = pd.Series(["abc", "defg"], dtype=ArrowDtype(pa.string()))
    expected = pd.Series(["abcabc", "defgdefg"], dtype=ArrowDtype(pa.string()))

    tm.assert_series_equal(strings * 2, expected)
    tm.assert_series_equal(2 * strings, expected)
@pytest.mark.parametrize(
    "interpolation", ["linear", "lower", "higher", "nearest", "midpoint"]
)
@pytest.mark.parametrize("quantile", [0.5, [0.5, 0.5]])
def test_quantile(data, interpolation, quantile, request):
    """Check ``Series.quantile`` on three copies of the first element of ``data``.

    String, binary and boolean dtypes must raise ``ArrowNotImplementedError``.
    Dtypes that are neither numeric nor temporal are marked xfail. Otherwise
    the result is compared with the repeated element (or, for timestamps with
    an interpolating method, with the numpy-backed equivalent).
    """
    pa_dtype = data.dtype.pyarrow_dtype

    # Built once; the original repeated this identical setup further down.
    data = data.take([0, 0, 0])
    ser = pd.Series(data)

    if (
        pa.types.is_string(pa_dtype)
        or pa.types.is_binary(pa_dtype)
        or pa.types.is_boolean(pa_dtype)
    ):
        # For string, bytes, and bool, we don't *expect* to have quantile work
        # Note this matches the non-pyarrow behavior
        msg = r"Function 'quantile' has no kernel matching input types \(.*\)"
        with pytest.raises(pa.ArrowNotImplementedError, match=msg):
            ser.quantile(q=quantile, interpolation=interpolation)
        return

    is_numeric = (
        pa.types.is_integer(pa_dtype)
        or pa.types.is_floating(pa_dtype)
        or pa.types.is_decimal(pa_dtype)
    )
    if not (is_numeric or pa.types.is_temporal(data._pa_array.type)):
        request.applymarker(
            pytest.mark.xfail(
                raises=pa.ArrowNotImplementedError,
                reason=f"quantile not supported by pyarrow for {pa_dtype}",
            )
        )

    result = ser.quantile(q=quantile, interpolation=interpolation)

    if pa.types.is_timestamp(pa_dtype) and interpolation not in ["lower", "higher"]:
        # rounding error will make the check below fail
        # (e.g. '2020-01-01 01:01:01.000001' vs '2020-01-01 01:01:01.000001024'),
        # so we'll check for now that we match the numpy analogue
        if pa_dtype.tz:
            pd_dtype = f"M8[{pa_dtype.unit}, {pa_dtype.tz}]"
        else:
            pd_dtype = f"M8[{pa_dtype.unit}]"
        ser_np = ser.astype(pd_dtype)

        expected = ser_np.quantile(q=quantile, interpolation=interpolation)
        if quantile == 0.5:
            if pa_dtype.unit == "us":
                expected = expected.to_pydatetime(warn=False)
            assert result == expected
        else:
            if pa_dtype.unit == "us":
                expected = expected.dt.floor("us")
            tm.assert_series_equal(result, expected.astype(data.dtype))
        return

    if quantile == 0.5:
        assert result == data[0]
    else:
        # Just check the values
        expected = pd.Series(data.take([0, 0]), index=[0.5, 0.5])
        if is_numeric:
            expected = expected.astype("float64[pyarrow]")
            result = result.astype("float64[pyarrow]")
        tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
    "take_idx, exp_idx",
    [
        ([0, 0, 2, 2, 4, 4], [4, 0]),
        ([0, 0, 0, 2, 4, 4], [0]),
    ],
    ids=["multi_mode", "single_mode"],
)
def test_mode_dropna_true(data_for_grouping, take_idx, exp_idx):
    """``mode(dropna=True)`` returns the element(s) picked by ``exp_idx``."""
    ser = pd.Series(data_for_grouping.take(take_idx))
    expected = pd.Series(data_for_grouping.take(exp_idx))
    tm.assert_series_equal(ser.mode(dropna=True), expected)
def test_mode_dropna_false_mode_na(data):
    """GH 50982: with ``dropna=False`` a missing value can be (part of) the mode."""
    dtype = data.dtype

    # NA occurs most often -> NA alone is the mode.
    mostly_na = pd.Series([None, None, data[0]], dtype=dtype)
    result = mostly_na.mode(dropna=False)
    tm.assert_series_equal(result, pd.Series([None], dtype=dtype))

    # One value and one NA tie -> both are returned.
    tied = pd.Series([data[0], None], dtype=dtype)
    result = tied.mode(dropna=False)
    tm.assert_series_equal(result, tied)
@pytest.mark.parametrize(
    "arrow_dtype, expected_type",
    [
        (pa.binary(), bytes),
        (pa.binary(16), bytes),
        (pa.large_binary(), bytes),
        (pa.large_string(), str),
        (pa.list_(pa.int64()), list),
        (pa.large_list(pa.int64()), list),
        (pa.map_(pa.string(), pa.int64()), list),
        (pa.struct([("f1", pa.int8()), ("f2", pa.string())]), dict),
        (pa.dictionary(pa.int64(), pa.int64()), CategoricalDtypeType),
    ],
)
def test_arrow_dtype_type(arrow_dtype, expected_type):
    """GH 51845: ``ArrowDtype.type`` maps each pyarrow type to a scalar type."""
    # TODO: Redundant with test_getitem_scalar once arrow_dtype exists in data fixture
    scalar_type = ArrowDtype(arrow_dtype).type
    assert scalar_type == expected_type
def test_is_bool_dtype():
    """GH 22667: a boolean arrow array is a bool dtype and a valid bool indexer."""
    mask = ArrowExtensionArray(pa.array([True, False, True]))
    assert is_bool_dtype(mask)
    assert pd.core.common.is_bool_indexer(mask)

    ser = pd.Series(range(len(mask)))
    selected = ser[mask]
    expected = ser[np.asarray(mask)]
    tm.assert_series_equal(selected, expected)
def test_is_numeric_dtype(data):
    """GH 50563: numeric exactly for float, integer and decimal arrow types."""
    pa_type = data.dtype.pyarrow_dtype
    expect_numeric = (
        pa.types.is_floating(pa_type)
        or pa.types.is_integer(pa_type)
        or pa.types.is_decimal(pa_type)
    )
    assert bool(is_numeric_dtype(data)) == bool(expect_numeric)
def test_is_integer_dtype(data):
    """GH 50667: ``is_integer_dtype`` agrees with ``pa.types.is_integer``."""
    expect_integer = pa.types.is_integer(data.dtype.pyarrow_dtype)
    assert bool(is_integer_dtype(data)) == bool(expect_integer)
def test_is_signed_integer_dtype(data):
    """``is_signed_integer_dtype`` agrees with ``pa.types.is_signed_integer``."""
    expect_signed = pa.types.is_signed_integer(data.dtype.pyarrow_dtype)
    assert bool(is_signed_integer_dtype(data)) == bool(expect_signed)
def test_is_unsigned_integer_dtype(data):
    """``is_unsigned_integer_dtype`` agrees with ``pa.types.is_unsigned_integer``."""
    expect_unsigned = pa.types.is_unsigned_integer(data.dtype.pyarrow_dtype)
    assert bool(is_unsigned_integer_dtype(data)) == bool(expect_unsigned)
def test_is_float_dtype(data):
    """``is_float_dtype`` agrees with ``pa.types.is_floating``."""
    expect_float = pa.types.is_floating(data.dtype.pyarrow_dtype)
    assert bool(is_float_dtype(data)) == bool(expect_float)
def test_pickle_roundtrip(data):
    """GH 42600: pickling a slice is smaller than the whole; both round-trip."""
    full = pd.Series(data)
    head = full.head(2)

    full_bytes = pickle.dumps(full)
    head_bytes = pickle.dumps(head)
    assert len(full_bytes) > len(head_bytes)

    tm.assert_series_equal(pickle.loads(full_bytes), full)
    tm.assert_series_equal(pickle.loads(head_bytes), head)
def test_astype_from_non_pyarrow(data):
    """GH49795: a non-arrow array can be cast to an ``ArrowDtype``."""
    non_arrow = data._pa_array.to_pandas().array
    converted = non_arrow.astype(data.dtype)

    assert not isinstance(non_arrow.dtype, ArrowDtype)
    assert isinstance(converted.dtype, ArrowDtype)
    tm.assert_extension_array_equal(converted, data)
def test_astype_float_from_non_pyarrow_str():
    """GH50430: the string ``"1.0"`` casts to ``float64[pyarrow]``."""
    target = "float64[pyarrow]"
    result = pd.Series(["1.0"]).astype(target)
    tm.assert_series_equal(result, pd.Series([1.0], dtype=target))
def test_astype_errors_ignore():
    """GH 55399: with ``errors="ignore"`` this cast returns the frame unchanged."""
    frame = pd.DataFrame({"col": [17000000]}, dtype="int32[pyarrow]")
    result = frame.astype("float[pyarrow]", errors="ignore")
    tm.assert_frame_equal(result, frame)
def test_to_numpy_with_defaults(data):
    """GH49973: ``to_numpy()`` with default arguments.

    Duration and timestamp types are skipped here. Non-numeric data with
    missing values is expected as an object array holding ``pd.NA``.
    """
    result = data.to_numpy()

    pa_type = data._pa_array.type
    if pa.types.is_duration(pa_type) or pa.types.is_timestamp(pa_type):
        pytest.skip("Tested in test_to_numpy_temporal")

    source = list(data) if pa.types.is_date(pa_type) else data._pa_array
    expected = np.array(source)

    if data._hasna and not is_numeric_dtype(data.dtype):
        expected = expected.astype(object)
        expected[pd.isna(data)] = pd.NA

    tm.assert_numpy_array_equal(result, expected)
def test_to_numpy_int_with_na():
    """GH51227: ``int64[pyarrow]`` with a missing value converts to floats.

    The expected array is ``[1.0, nan]`` and the first element must be a
    ``float``.
    """
    arr = pd.array([1, None], dtype="int64[pyarrow]")
    result = arr.to_numpy()

    assert isinstance(result[0], float)
    tm.assert_numpy_array_equal(result, np.array([1, np.nan]))
@pytest.mark.parametrize("na_val, exp", [(lib.no_default, np.nan), (1, 1)])
def test_to_numpy_null_array(na_val, exp):
    """GH#52443: a null-typed array converts to float64 filled with ``na_value``."""
    nulls = pd.array([pd.NA, pd.NA], dtype="null[pyarrow]")
    result = nulls.to_numpy(dtype="float64", na_value=na_val)
    tm.assert_numpy_array_equal(result, np.array([exp, exp], dtype="float64"))
def test_to_numpy_null_array_no_dtype():
    """GH#52443: without a dtype, a null-typed array converts to object ``pd.NA``."""
    nulls = pd.array([pd.NA, pd.NA], dtype="null[pyarrow]")
    result = nulls.to_numpy(dtype=None)
    tm.assert_numpy_array_equal(result, np.array([pd.NA, pd.NA], dtype="object"))
def test_to_numpy_without_dtype():
    """GH 54808: ``to_numpy(na_value=...)`` keeps the natural numpy dtype."""
    cases = [
        ([True, pd.NA], "boolean[pyarrow]", False, [True, False], np.bool_),
        ([1.0, pd.NA], "float32[pyarrow]", 0.0, [1.0, 0.0], np.float32),
    ]
    for values, arrow_dtype, na_value, expected_values, np_dtype in cases:
        arr = pd.array(values, dtype=arrow_dtype)
        result = arr.to_numpy(na_value=na_value)
        expected = np.array(expected_values, dtype=np_dtype)
        tm.assert_numpy_array_equal(result, expected)
def test_setitem_null_slice(data):
    """GH50248: assigning through ``[:]`` with a scalar, an array and a list."""
    orig = data.copy()

    # Scalar broadcast to every position.
    target = orig.copy()
    target[:] = data[0]
    broadcast = ArrowExtensionArray._from_sequence(
        [data[0]] * len(data),
        dtype=data.dtype,
    )
    tm.assert_extension_array_equal(target, broadcast)

    # Extension array of the same length.
    target = orig.copy()
    target[:] = data[::-1]
    tm.assert_extension_array_equal(target, data[::-1])

    # Plain Python list.
    target = orig.copy()
    target[:] = data.tolist()
    tm.assert_extension_array_equal(target, data)
def test_setitem_invalid_dtype(data):
    """GH50248: assigning a value of the wrong kind through ``[:]`` raises.

    String/binary data gets an integer; everything else gets ``"foo"``.
    Integer, float and boolean data surface ``pa.ArrowInvalid``; all other
    cases raise ``TypeError``.
    """
    pa_type = data._pa_array.type

    # Defaults describe the catch-all branch.
    fill_value = "foo"
    err = TypeError
    msg = "Invalid value 'foo' for dtype"

    if pa.types.is_string(pa_type) or pa.types.is_binary(pa_type):
        fill_value = 123
        msg = "Invalid value '123' for dtype"
    elif (
        pa.types.is_integer(pa_type)
        or pa.types.is_floating(pa_type)
        or pa.types.is_boolean(pa_type)
    ):
        err = pa.ArrowInvalid
        msg = "Could not convert"

    with pytest.raises(err, match=msg):
        data[:] = fill_value
def test_from_arrow_respecting_given_dtype():
    """``to_pandas`` honours a ``types_mapper`` that maps date32 to date64."""
    target_dtype = ArrowDtype(pa.date64())
    timestamps = [pd.Timestamp("2019-12-31"), pd.Timestamp("2019-12-31")]

    date_array = pa.array(timestamps, type=pa.date32())
    mapping = {pa.date32(): ArrowDtype(pa.date64())}
    result = date_array.to_pandas(types_mapper=mapping.get)

    expected = pd.Series(timestamps, dtype=target_dtype)
    tm.assert_series_equal(result, expected)
def test_from_arrow_respecting_given_dtype_unsafe():
    """Mapping float64 values ``1.5``/``2.5`` to int64 raises ``pa.ArrowInvalid``."""
    floats = pa.array([1.5, 2.5], type=pa.float64())
    mapping = {pa.float64(): ArrowDtype(pa.int64())}
    with tm.external_error_raised(pa.ArrowInvalid):
        floats.to_pandas(types_mapper=mapping.get)
def test_round():
    """``Series.round`` on ``float64[pyarrow]`` with positive and negative decimals."""
    dtype = "float64[pyarrow]"
    cases = [
        ([0.0, 1.23, 2.56, pd.NA], 1, [0.0, 1.2, 2.6, pd.NA]),
        ([123.4, pd.NA, 56.78], -1, [120.0, pd.NA, 60.0]),
    ]
    for values, decimals, rounded in cases:
        ser = pd.Series(values, dtype=dtype)
        result = ser.round(decimals)
        expected = pd.Series(rounded, dtype=dtype)
        tm.assert_series_equal(result, expected)
def test_searchsorted_with_na_raises(data_for_sorting, as_series):
    """GH50447: ``searchsorted`` on data containing a missing value raises."""
    # The fixture unpacks to three elements; only the first is searched for.
    b, _, _ = data_for_sorting
    arr = data_for_sorting.take([2, 0, 1])  # to get [a, b, c]
    arr[-1] = pd.NA

    if as_series:
        arr = pd.Series(arr)

    msg = (
        "searchsorted requires array to be sorted, "
        "which is impossible with NAs present."
    )
    with pytest.raises(ValueError, match=msg):
        arr.searchsorted(b)
def test_sort_values_dictionary():
    """Sorting by a dictionary-typed column leaves already-sorted data unchanged."""
    dict_dtype = ArrowDtype(pa.dictionary(pa.int32(), pa.string()))
    df = pd.DataFrame(
        {
            "a": pd.Series(["x", "y"], dtype=dict_dtype),
            "b": [1, 2],
        },
    )
    expected = df.copy()
    result = df.sort_values(by=["a", "b"])
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("pat", ["abc", "a[a-z]{2}"])
def test_str_count(pat):
    """``str.count`` returns int32 counts and keeps missing values missing."""
    ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string()))
    expected = pd.Series([1, None], dtype=ArrowDtype(pa.int32()))
    tm.assert_series_equal(ser.str.count(pat), expected)
def test_str_count_flags_unsupported():
    """``str.count`` with regex ``flags`` raises NotImplementedError."""
    string_ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string()))
    with pytest.raises(NotImplementedError, match="count not"):
        string_ser.str.count("abc", flags=1)
@pytest.mark.parametrize(
    "side, str_func",
    [("left", "rjust"), ("right", "ljust"), ("both", "center")],
)
def test_str_pad(side, str_func):
    """``str.pad`` matches the corresponding built-in ``str`` justification."""
    string_dtype = ArrowDtype(pa.string())
    ser = pd.Series(["a", None], dtype=string_dtype)

    result = ser.str.pad(width=3, side=side, fillchar="x")

    padded = getattr("a", str_func)(3, "x")
    expected = pd.Series([padded, None], dtype=ArrowDtype(pa.string()))
    tm.assert_series_equal(result, expected)
def test_str_pad_invalid_side():
    """An unknown ``side`` passed to str.pad raises ValueError naming the value."""
    strings = pd.Series(["a", None], dtype=ArrowDtype(pa.string()))
    with pytest.raises(ValueError, match="Invalid side: foo"):
        strings.str.pad(3, "foo", "x")
@pytest.mark.parametrize(
    "pat, case, na, regex, exp",
    [
        ["ab", False, None, False, [True, None]],
        ["Ab", True, None, False, [False, None]],
        ["ab", False, True, False, [True, True]],
        ["a[a-z]{1}", False, None, True, [True, None]],
        ["A[a-z]{1}", True, None, True, [False, None]],
    ],
)
def test_str_contains(pat, case, na, regex, exp):
    """str.contains on Arrow strings returns an Arrow bool Series per the case table."""
    strings = pd.Series(["abc", None], dtype=ArrowDtype(pa.string()))
    found = strings.str.contains(pat, case=case, na=na, regex=regex)
    wanted = pd.Series(exp, dtype=ArrowDtype(pa.bool_()))
    tm.assert_series_equal(found, wanted)
def test_str_contains_flags_unsupported():
    """Passing regex flags to str.contains on Arrow strings raises NotImplementedError."""
    strings = pd.Series(["abc", None], dtype=ArrowDtype(pa.string()))
    with pytest.raises(NotImplementedError, match="contains not"):
        strings.str.contains("a", flags=1)
@pytest.mark.parametrize(
    "side, pat, na, exp",
    [
        ["startswith", "ab", None, [True, None, False]],
        ["startswith", "b", False, [False, False, False]],
        ["endswith", "b", True, [False, True, False]],
        ["endswith", "bc", None, [True, None, False]],
        ["startswith", ("a", "e", "g"), None, [True, None, True]],
        ["endswith", ("a", "c", "g"), None, [True, None, True]],
        ["startswith", (), None, [False, None, False]],
        ["endswith", (), None, [False, None, False]],
    ],
)
def test_str_start_ends_with(side, pat, na, exp):
    """str.startswith / str.endswith accept a string or tuple pattern on Arrow strings."""
    strings = pd.Series(["abc", None, "efg"], dtype=ArrowDtype(pa.string()))
    accessor_method = getattr(strings.str, side)
    matched = accessor_method(pat, na=na)
    wanted = pd.Series(exp, dtype=ArrowDtype(pa.bool_()))
    tm.assert_series_equal(matched, wanted)
@pytest.mark.parametrize("side", ("startswith", "endswith"))
def test_str_starts_ends_with_all_nulls_empty_tuple(side):
    """An empty-tuple pattern on an all-null Arrow string Series gives all-null bools."""
    all_null = pd.Series([None, None], dtype=ArrowDtype(pa.string()))
    matched = getattr(all_null.str, side)(())

    # bool datatype preserved for all nulls.
    wanted = pd.Series([None, None], dtype=ArrowDtype(pa.bool_()))
    tm.assert_series_equal(matched, wanted)
@pytest.mark.parametrize(
    "arg_name, arg",
    [["pat", re.compile("b")], ["repl", str], ["case", False], ["flags", 1]],
)
def test_str_replace_unsupported(arg_name, arg):
    """Each parametrized str.replace argument is rejected with NotImplementedError."""
    strings = pd.Series(["abc", None], dtype=ArrowDtype(pa.string()))
    # Start from a supported call and override exactly one argument.
    call_kwargs = {"pat": "b", "repl": "x", "regex": True}
    call_kwargs.update({arg_name: arg})
    with pytest.raises(NotImplementedError, match="replace is not supported"):
        strings.str.replace(**call_kwargs)
@pytest.mark.parametrize(
    "pat, repl, n, regex, exp",
    [
        ["a", "x", -1, False, ["xbxc", None]],
        ["a", "x", 1, False, ["xbac", None]],
        ["[a-b]", "x", -1, True, ["xxxc", None]],
    ],
)
def test_str_replace(pat, repl, n, regex, exp):
    """str.replace on Arrow strings for literal and regex patterns with a count."""
    string_dtype = ArrowDtype(pa.string())
    strings = pd.Series(["abac", None], dtype=string_dtype)
    replaced = strings.str.replace(pat, repl, n=n, regex=regex)
    wanted = pd.Series(exp, dtype=string_dtype)
    tm.assert_series_equal(replaced, wanted)
def test_str_replace_negative_n():
    """A negative ``n`` other than -1 in str.replace removes every match."""
    # GH 56404
    arrow_strings = pd.Series(["abc", "aaaaaa"], dtype=ArrowDtype(pa.string()))
    got = arrow_strings.str.replace("a", "", -3, True)
    wanted = pd.Series(["bc", ""], dtype=ArrowDtype(pa.string()))
    tm.assert_series_equal(wanted, got)

    # Same bug for pyarrow-backed StringArray GH#59628
    na_backed = arrow_strings.astype(pd.StringDtype(storage="pyarrow"))
    got_na = na_backed.str.replace("a", "", -3, True)
    wanted_na = wanted.astype(na_backed.dtype)
    tm.assert_series_equal(wanted_na, got_na)

    nan_backed = arrow_strings.astype(
        pd.StringDtype(storage="pyarrow", na_value=np.nan)
    )
    got_nan = nan_backed.str.replace("a", "", -3, True)
    wanted_nan = wanted.astype(nan_backed.dtype)
    tm.assert_series_equal(wanted_nan, got_nan)
def test_str_repeat_unsupported():
    """str.repeat with a list of repeats raises NotImplementedError on Arrow strings."""
    strings = pd.Series(["abc", None], dtype=ArrowDtype(pa.string()))
    with pytest.raises(NotImplementedError, match="repeat is not"):
        strings.str.repeat([1, 2])
def test_str_repeat():
    """str.repeat with a scalar count duplicates each string and keeps nulls."""
    string_dtype = ArrowDtype(pa.string())
    strings = pd.Series(["abc", None], dtype=string_dtype)
    doubled = strings.str.repeat(2)
    wanted = pd.Series(["abcabc", None], dtype=string_dtype)
    tm.assert_series_equal(doubled, wanted)
@pytest.mark.parametrize(
    "pat, case, na, exp",
    [
        ["ab", False, None, [True, None]],
        ["Ab", True, None, [False, None]],
        ["bc", True, None, [False, None]],
        ["ab", False, True, [True, True]],
        ["a[a-z]{1}", False, None, [True, None]],
        ["A[a-z]{1}", True, None, [False, None]],
    ],
)
def test_str_match(pat, case, na, exp):
    """str.match on Arrow strings returns an Arrow bool Series per the case table."""
    strings = pd.Series(["abc", None], dtype=ArrowDtype(pa.string()))
    matched = strings.str.match(pat, case=case, na=na)
    wanted = pd.Series(exp, dtype=ArrowDtype(pa.bool_()))
    tm.assert_series_equal(matched, wanted)
@pytest.mark.parametrize(
    "pat, case, na, exp",
    # Note: keep cases in sync with
    # pandas/tests/strings/test_find_replace.py::test_str_fullmatch_extra_cases
    [
        ["abc", False, None, [True, False, False, None]],
        ["Abc", True, None, [False, False, False, None]],
        ["bc", True, None, [False, False, False, None]],
        ["ab", False, None, [False, False, False, None]],
        ["a[a-z]{2}", False, None, [True, False, False, None]],
        ["A[a-z]{1}", True, None, [False, False, False, None]],
        # GH Issue: #56652
        ["abc$", False, None, [True, False, False, None]],
        ["abc\\$", False, None, [False, True, False, None]],
        ["Abc$", True, None, [False, False, False, None]],
        ["Abc\\$", True, None, [False, False, False, None]],
        # https://github.com/pandas-dev/pandas/issues/61072
        ["(abc)|(abx)", True, None, [True, False, False, None]],
        ["((abc)|(abx))", True, None, [True, False, False, None]],
    ],
)
def test_str_fullmatch(pat, case, na, exp):
    """str.fullmatch on Arrow strings, including ``$`` anchors and alternation groups."""
    strings = pd.Series(
        ["abc", "abc$", "$abc", None], dtype=ArrowDtype(pa.string())
    )
    matched = strings.str.fullmatch(pat, case=case, na=na)
    wanted = pd.Series(exp, dtype=ArrowDtype(pa.bool_()))
    tm.assert_series_equal(matched, wanted)
@pytest.mark.parametrize(
    "sub, start, end, exp, exp_typ",
    [["ab", 0, None, [0, None], pa.int32()], ["bc", 1, 3, [1, None], pa.int64()]],
)
def test_str_find(sub, start, end, exp, exp_typ):
    """str.find on Arrow strings; the expected integer width is given per case."""
    strings = pd.Series(["abc", None], dtype=ArrowDtype(pa.string()))
    positions = strings.str.find(sub, start=start, end=end)
    wanted = pd.Series(exp, dtype=ArrowDtype(exp_typ))
    tm.assert_series_equal(positions, wanted)
def test_str_find_negative_start():
    """str.find with a start far before the beginning still locates the substring."""
    # GH 56411
    strings = pd.Series(["abc", None], dtype=ArrowDtype(pa.string()))
    positions = strings.str.find(sub="b", start=-1000, end=3)
    wanted = pd.Series([1, None], dtype=ArrowDtype(pa.int64()))
    tm.assert_series_equal(positions, wanted)
def test_str_find_no_end():
    """str.find with only ``start`` given reports -1 when the match lies before it."""
    strings = pd.Series(["abc", None], dtype=ArrowDtype(pa.string()))
    positions = strings.str.find("ab", start=1)
    wanted = pd.Series([-1, None], dtype="int64[pyarrow]")
    tm.assert_series_equal(positions, wanted)
def test_str_find_negative_start_negative_end():
    """str.find with negative start and end returns the absolute match position."""
    # GH 56791
    strings = pd.Series(["abcdefg", None], dtype=ArrowDtype(pa.string()))
    positions = strings.str.find(sub="d", start=-6, end=-3)
    wanted = pd.Series([3, None], dtype=ArrowDtype(pa.int64()))
    tm.assert_series_equal(positions, wanted)
def test_str_find_large_start():
    """str.find with a start beyond the string length reports -1."""
    # GH 56791
    strings = pd.Series(["abcdefg", None], dtype=ArrowDtype(pa.string()))
    positions = strings.str.find(sub="d", start=16)
    wanted = pd.Series([-1, None], dtype=ArrowDtype(pa.int64()))
    tm.assert_series_equal(positions, wanted)
@pytest.mark.skipif(
    pa_version_under13p0, reason="https://github.com/apache/arrow/issues/36311"
)
@pytest.mark.parametrize("start", [-15, -3, 0, 1, 15, None])
@pytest.mark.parametrize("end", [-15, -1, 0, 3, 15, None])
@pytest.mark.parametrize("sub", ["", "az", "abce", "a", "caa"])
def test_str_find_e2e(start, end, sub):
    """Cross-check str.find on Arrow-backed strings against python-storage strings."""
    arrow_series = pd.Series(
        ["abcaadef", "abc", "abcdeddefgj8292", "ab", "a", ""],
        dtype=ArrowDtype(pa.string()),
    )
    # The python-storage string Series provides the reference answer.
    python_series = arrow_series.astype(pd.StringDtype(storage="python"))
    arrow_found = arrow_series.str.find(sub, start, end)
    reference = python_series.str.find(sub, start, end).astype(arrow_found.dtype)
    tm.assert_series_equal(arrow_found, reference)

    # The pyarrow-storage StringDtype is compared to the same reference.
    pyarrow_string_series = arrow_series.astype(pd.StringDtype(storage="pyarrow"))
    pyarrow_found = pyarrow_string_series.str.find(sub, start, end).astype(
        arrow_found.dtype
    )
    tm.assert_series_equal(pyarrow_found, reference)
def test_str_find_negative_start_negative_end_no_match():
    """str.find reports -1 when the negative start lies after the negative end."""
    # GH 56791
    strings = pd.Series(["abcdefg", None], dtype=ArrowDtype(pa.string()))
    positions = strings.str.find(sub="d", start=-3, end=-6)
    wanted = pd.Series([-1, None], dtype=ArrowDtype(pa.int64()))
    tm.assert_series_equal(positions, wanted)
@pytest.mark.parametrize(
    "i, exp",
    [
        [1, ["b", "e", None]],
        [-1, ["c", "e", None]],
        [2, ["c", None, None]],
        [-3, ["a", None, None]],
        [4, [None, None, None]],
    ],
)
def test_str_get(i, exp):
    """str.get on Arrow strings returns null for out-of-range positions."""
    string_dtype = ArrowDtype(pa.string())
    strings = pd.Series(["abc", "de", None], dtype=string_dtype)
    picked = strings.str.get(i)
    wanted = pd.Series(exp, dtype=string_dtype)
    tm.assert_series_equal(picked, wanted)
@pytest.mark.xfail(
    reason="TODO: StringMethods._validate should support Arrow list types",
    raises=AttributeError,
)
def test_str_join():
    """str.join over an Arrow list-of-strings Series (currently expected to fail)."""
    list_values = pa.array([list("abc"), list("123"), None])
    lists = pd.Series(ArrowExtensionArray(list_values))
    joined = lists.str.join("=")
    wanted = pd.Series(["a=b=c", "1=2=3", None], dtype=ArrowDtype(pa.string()))
    tm.assert_series_equal(joined, wanted)
def test_str_join_string_type():
    """str.join on plain Arrow strings inserts the separator between characters."""
    string_values = pa.array(["abc", "123", None])
    strings = pd.Series(ArrowExtensionArray(string_values))
    joined = strings.str.join("=")
    wanted = pd.Series(["a=b=c", "1=2=3", None], dtype=ArrowDtype(pa.string()))
    tm.assert_series_equal(joined, wanted)
@pytest.mark.parametrize(
    "start, stop, step, exp",
    [
        [None, 2, None, ["ab", None]],
        [None, 2, 1, ["ab", None]],
        [1, 3, 1, ["bc", None]],
        (None, None, -1, ["dcba", None]),
    ],
)
def test_str_slice(start, stop, step, exp):
    """str.slice on Arrow strings, including a reversing negative step."""
    string_dtype = ArrowDtype(pa.string())
    strings = pd.Series(["abcd", None], dtype=string_dtype)
    sliced = strings.str.slice(start, stop, step)
    wanted = pd.Series(exp, dtype=string_dtype)
    tm.assert_series_equal(sliced, wanted)
@pytest.mark.parametrize(
    "start, stop, repl, exp",
    [
        [1, 2, "x", ["axcd", None]],
        [None, 2, "x", ["xcd", None]],
        [None, 2, None, ["cd", None]],
    ],
)
def test_str_slice_replace(start, stop, repl, exp):
    """str.slice_replace on Arrow strings, with and without a replacement string."""
    string_dtype = ArrowDtype(pa.string())
    strings = pd.Series(["abcd", None], dtype=string_dtype)
    replaced = strings.str.slice_replace(start, stop, repl)
    wanted = pd.Series(exp, dtype=string_dtype)
    tm.assert_series_equal(replaced, wanted)
@pytest.mark.parametrize(
    "value, method, exp",
    [
        ["a1c", "isalnum", True],
        ["!|,", "isalnum", False],
        ["aaa", "isalpha", True],
        ["!!!", "isalpha", False],
        ["٠", "isdecimal", True],  # noqa: RUF001
        ["~!", "isdecimal", False],
        ["2", "isdigit", True],
        ["~", "isdigit", False],
        ["aaa", "islower", True],
        ["aaA", "islower", False],
        ["123", "isnumeric", True],
        ["11I", "isnumeric", False],
        [" ", "isspace", True],
        ["", "isspace", False],
        ["The That", "istitle", True],
        ["the That", "istitle", False],
        ["AAA", "isupper", True],
        ["AAc", "isupper", False],
    ],
)
def test_str_is_functions(value, method, exp):
    """Each str.is* predicate on Arrow strings gives an Arrow bool and keeps nulls."""
    strings = pd.Series([value, None], dtype=ArrowDtype(pa.string()))
    predicate = getattr(strings.str, method)
    flags = predicate()
    wanted = pd.Series([exp, None], dtype=ArrowDtype(pa.bool_()))
    tm.assert_series_equal(flags, wanted)
@pytest.mark.parametrize(
    "method, exp",
    [
        ["capitalize", "Abc def"],
        ["title", "Abc Def"],
        ["swapcase", "AbC Def"],
        ["lower", "abc def"],
        ["upper", "ABC DEF"],
        ["casefold", "abc def"],
    ],
)
def test_str_transform_functions(method, exp):
    """Case-changing str methods on Arrow strings keep the string dtype and nulls."""
    string_dtype = ArrowDtype(pa.string())
    strings = pd.Series(["aBc dEF", None], dtype=string_dtype)
    transform = getattr(strings.str, method)
    transformed = transform()
    wanted = pd.Series([exp, None], dtype=string_dtype)
    tm.assert_series_equal(transformed, wanted)
def test_str_len():
    """str.len on Arrow strings yields int32 lengths and keeps nulls."""
    strings = pd.Series(["abcd", None], dtype=ArrowDtype(pa.string()))
    lengths = strings.str.len()
    wanted = pd.Series([4, None], dtype=ArrowDtype(pa.int32()))
    tm.assert_series_equal(lengths, wanted)
@pytest.mark.parametrize(
    "method, to_strip, val",
    [
        ["strip", None, " abc "],
        ["strip", "x", "xabcx"],
        ["lstrip", None, " abc"],
        ["lstrip", "x", "xabc"],
        ["rstrip", None, "abc "],
        ["rstrip", "x", "abcx"],
    ],
)
def test_str_strip(method, to_strip, val):
    """strip / lstrip / rstrip on Arrow strings, default whitespace or given chars."""
    string_dtype = ArrowDtype(pa.string())
    strings = pd.Series([val, None], dtype=string_dtype)
    stripper = getattr(strings.str, method)
    stripped = stripper(to_strip=to_strip)
    wanted = pd.Series(["abc", None], dtype=string_dtype)
    tm.assert_series_equal(stripped, wanted)
@pytest.mark.parametrize("val", ["abc123", "abc"])
def test_str_removesuffix(val):
    """str.removesuffix drops the suffix when present and is a no-op otherwise."""
    string_dtype = ArrowDtype(pa.string())
    strings = pd.Series([val, None], dtype=string_dtype)
    trimmed = strings.str.removesuffix("123")
    wanted = pd.Series(["abc", None], dtype=string_dtype)
    tm.assert_series_equal(trimmed, wanted)
@pytest.mark.parametrize("val", ["123abc", "abc"])
def test_str_removeprefix(val):
    """str.removeprefix drops the prefix when present and is a no-op otherwise."""
    string_dtype = ArrowDtype(pa.string())
    strings = pd.Series([val, None], dtype=string_dtype)
    trimmed = strings.str.removeprefix("123")
    wanted = pd.Series(["abc", None], dtype=string_dtype)
    tm.assert_series_equal(trimmed, wanted)
@pytest.mark.parametrize("errors", ["ignore", "strict"])
@pytest.mark.parametrize(
    "encoding, exp",
    [
        ["utf8", b"abc"],
        ["utf32", b"\xff\xfe\x00\x00a\x00\x00\x00b\x00\x00\x00c\x00\x00\x00"],
    ],
)
def test_str_encode(errors, encoding, exp):
    """str.encode on Arrow strings produces an Arrow binary Series."""
    strings = pd.Series(["abc", None], dtype=ArrowDtype(pa.string()))
    encoded = strings.str.encode(encoding, errors)
    wanted = pd.Series([exp, None], dtype=ArrowDtype(pa.binary()))
    tm.assert_series_equal(encoded, wanted)
@pytest.mark.parametrize("flags", [0, 2])
def test_str_findall(flags):
    """str.findall on Arrow strings returns an Arrow list-of-strings Series."""
    strings = pd.Series(["abc", "efg", None], dtype=ArrowDtype(pa.string()))
    matches = strings.str.findall("b", flags=flags)
    list_dtype = ArrowDtype(pa.list_(pa.string()))
    wanted = pd.Series([["b"], [], None], dtype=list_dtype)
    tm.assert_series_equal(matches, wanted)
@pytest.mark.parametrize("method", ["index", "rindex"])
@pytest.mark.parametrize(
    "start, end",
    [
        [0, None],
        [1, 4],
    ],
)
def test_str_r_index(method, start, end):
    """str.index / str.rindex find the substring or raise ValueError when absent."""
    strings = pd.Series(["abcba", None], dtype=ArrowDtype(pa.string()))

    positions = getattr(strings.str, method)("c", start, end)
    wanted = pd.Series([2, None], dtype=ArrowDtype(pa.int64()))
    tm.assert_series_equal(positions, wanted)

    # A substring that never occurs is reported as an error.
    with pytest.raises(ValueError, match="substring not found"):
        getattr(strings.str, method)("foo", start, end)
@pytest.mark.parametrize("form", ["NFC", "NFKC"])
def test_str_normalize(form):
    """str.normalize leaves a plain-ASCII Arrow string Series unchanged."""
    strings = pd.Series(["abc", None], dtype=ArrowDtype(pa.string()))
    normalized = strings.str.normalize(form)
    wanted = strings.copy()
    tm.assert_series_equal(normalized, wanted)
@pytest.mark.parametrize(
    "start, end",
    [
        [0, None],
        [1, 4],
    ],
)
def test_str_rfind(start, end):
    """str.rfind on Arrow strings returns the position, -1 when absent, null for null."""
    strings = pd.Series(["abcba", "foo", None], dtype=ArrowDtype(pa.string()))
    positions = strings.str.rfind("c", start, end)
    wanted = pd.Series([2, -1, None], dtype=ArrowDtype(pa.int64()))
    tm.assert_series_equal(positions, wanted)
def test_str_translate():
    """str.translate applies a code-point mapping to Arrow strings."""
    string_dtype = ArrowDtype(pa.string())
    strings = pd.Series(["abcba", None], dtype=string_dtype)
    # 97 is the code point of "a".
    translated = strings.str.translate({97: "b"})
    wanted = pd.Series(["bbcbb", None], dtype=string_dtype)
    tm.assert_series_equal(translated, wanted)
def test_str_wrap():
    """str.wrap breaks Arrow strings into newline-separated lines of the given width."""
    string_dtype = ArrowDtype(pa.string())
    strings = pd.Series(["abcba", None], dtype=string_dtype)
    wrapped = strings.str.wrap(3)
    wanted = pd.Series(["abc\nba", None], dtype=string_dtype)
    tm.assert_series_equal(wrapped, wanted)
def test_get_dummies():
    """str.get_dummies on Arrow strings gives an Arrow bool indicator frame."""
    strings = pd.Series(["a|b", None, "a|c"], dtype=ArrowDtype(pa.string()))
    dummies = strings.str.get_dummies()

    # One row per input value; the null row has no indicator set.
    indicator_rows = [
        [True, True, False],
        [False, False, False],
        [True, False, True],
    ]
    wanted = pd.DataFrame(
        indicator_rows,
        dtype=ArrowDtype(pa.bool_()),
        columns=["a", "b", "c"],
    )
    tm.assert_frame_equal(dummies, wanted)
def test_str_partition():
    """str.partition / str.rpartition on Arrow strings, expanded and unexpanded."""
    strings = pd.Series(["abcba", None], dtype=ArrowDtype(pa.string()))

    # partition, expanded into a frame
    parts = strings.str.partition("b")
    wanted = pd.DataFrame(
        [["a", "b", "cba"], [None, None, None]], dtype=ArrowDtype(pa.string())
    )
    tm.assert_frame_equal(parts, wanted)

    # partition, kept as a single list-valued Series
    parts = strings.str.partition("b", expand=False)
    wanted = pd.Series(ArrowExtensionArray(pa.array([["a", "b", "cba"], None])))
    tm.assert_series_equal(parts, wanted)

    # rpartition, expanded into a frame
    parts = strings.str.rpartition("b")
    wanted = pd.DataFrame(
        [["abc", "b", "a"], [None, None, None]], dtype=ArrowDtype(pa.string())
    )
    tm.assert_frame_equal(parts, wanted)

    # rpartition, kept as a single list-valued Series
    parts = strings.str.rpartition("b", expand=False)
    wanted = pd.Series(ArrowExtensionArray(pa.array([["abc", "b", "a"], None])))
    tm.assert_series_equal(parts, wanted)
@pytest.mark.parametrize("method", ["rsplit", "split"])
def test_str_split_pat_none(method):
    """split / rsplit without a pattern split Arrow strings on whitespace."""
    # GH 56271
    strings = pd.Series(["a1 cbc\nb", None], dtype=ArrowDtype(pa.string()))
    splitter = getattr(strings.str, method)
    pieces = splitter()
    wanted = pd.Series(ArrowExtensionArray(pa.array([["a1", "cbc", "b"], None])))
    tm.assert_series_equal(pieces, wanted)
def test_str_split():
    """str.split on Arrow strings: literal, limited, regex and expanded variants."""
    # GH 52401
    strings = pd.Series(["a1cbcb", "a2cbcb", None], dtype=ArrowDtype(pa.string()))

    # Literal separator, unlimited splits.
    pieces = strings.str.split("c")
    wanted = pd.Series(
        ArrowExtensionArray(pa.array([["a1", "b", "b"], ["a2", "b", "b"], None]))
    )
    tm.assert_series_equal(pieces, wanted)

    # Literal separator, at most one split.
    pieces = strings.str.split("c", n=1)
    wanted = pd.Series(
        ArrowExtensionArray(pa.array([["a1", "bcb"], ["a2", "bcb"], None]))
    )
    tm.assert_series_equal(pieces, wanted)

    # Regex separator.
    pieces = strings.str.split("[1-2]", regex=True)
    wanted = pd.Series(
        ArrowExtensionArray(pa.array([["a", "cbcb"], ["a", "cbcb"], None]))
    )
    tm.assert_series_equal(pieces, wanted)

    # Regex separator, expanded into columns.
    pieces = strings.str.split("[1-2]", regex=True, expand=True)
    wanted = pd.DataFrame(
        {
            0: ArrowExtensionArray(pa.array(["a", "a", None])),
            1: ArrowExtensionArray(pa.array(["cbcb", "cbcb", None])),
        }
    )
    tm.assert_frame_equal(pieces, wanted)

    # Literal separator present in only one row, expanded into columns.
    pieces = strings.str.split("1", expand=True)
    wanted = pd.DataFrame(
        {
            0: ArrowExtensionArray(pa.array(["a", "a2cbcb", None])),
            1: ArrowExtensionArray(pa.array(["cbcb", None, None])),
        }
    )
    tm.assert_frame_equal(pieces, wanted)
def test_str_rsplit():
    """str.rsplit on Arrow strings: unlimited, limited and expanded variants."""
    # GH 52401
    strings = pd.Series(["a1cbcb", "a2cbcb", None], dtype=ArrowDtype(pa.string()))

    # Unlimited splits.
    pieces = strings.str.rsplit("c")
    wanted = pd.Series(
        ArrowExtensionArray(pa.array([["a1", "b", "b"], ["a2", "b", "b"], None]))
    )
    tm.assert_series_equal(pieces, wanted)

    # At most one split, counted from the right.
    pieces = strings.str.rsplit("c", n=1)
    wanted = pd.Series(
        ArrowExtensionArray(pa.array([["a1cb", "b"], ["a2cb", "b"], None]))
    )
    tm.assert_series_equal(pieces, wanted)

    # At most one split, expanded into columns.
    pieces = strings.str.rsplit("c", n=1, expand=True)
    wanted = pd.DataFrame(
        {
            0: ArrowExtensionArray(pa.array(["a1cb", "a2cb", None])),
            1: ArrowExtensionArray(pa.array(["b", "b", None])),
        }
    )
    tm.assert_frame_equal(pieces, wanted)

    # Separator present in only one row, expanded into columns.
    pieces = strings.str.rsplit("1", expand=True)
    wanted = pd.DataFrame(
        {
            0: ArrowExtensionArray(pa.array(["a", "a2cbcb", None])),
            1: ArrowExtensionArray(pa.array(["cbcb", None, None])),
        }
    )
    tm.assert_frame_equal(pieces, wanted)
def test_str_extract_non_symbolic():
    """str.extract on Arrow strings rejects a pattern without named groups."""
    strings = pd.Series(["a1", "b2", "c3"], dtype=ArrowDtype(pa.string()))
    expected_message = "pat=.* must contain a symbolic group name."
    with pytest.raises(ValueError, match=expected_message):
        strings.str.extract(r"[ab](\d)")
@pytest.mark.parametrize("expand", [True, False])
def test_str_extract(expand):
    """str.extract with two named groups returns a frame for either ``expand`` value."""
    strings = pd.Series(["a1", "b2", "c3"], dtype=ArrowDtype(pa.string()))
    extracted = strings.str.extract(
        r"(?P<letter>[ab])(?P<digit>\d)", expand=expand
    )
    letters = ArrowExtensionArray(pa.array(["a", "b", None]))
    digits = ArrowExtensionArray(pa.array(["1", "2", None]))
    wanted = pd.DataFrame({"letter": letters, "digit": digits})
    tm.assert_frame_equal(extracted, wanted)
def test_str_extract_expand():
    """str.extract with one named group: frame when expanded, named Series otherwise."""
    strings = pd.Series(["a1", "b2", "c3"], dtype=ArrowDtype(pa.string()))
    pattern = r"[ab](?P<digit>\d)"

    as_frame = strings.str.extract(pattern, expand=True)
    wanted_frame = pd.DataFrame(
        {
            "digit": ArrowExtensionArray(pa.array(["1", "2", None])),
        }
    )
    tm.assert_frame_equal(as_frame, wanted_frame)

    as_series = strings.str.extract(pattern, expand=False)
    wanted_series = pd.Series(
        ArrowExtensionArray(pa.array(["1", "2", None])), name="digit"
    )
    tm.assert_series_equal(as_series, wanted_series)
@pytest.mark.parametrize("unit", ["ns", "us", "ms", "s"])
def test_duration_from_strings_with_nat(unit):
    """_from_sequence_of_strings parses "NaT" as null for Arrow duration types."""
    # GH51175
    duration_type = pa.duration(unit)
    parsed = ArrowExtensionArray._from_sequence_of_strings(
        ["1000", "NaT"], dtype=duration_type
    )
    wanted = ArrowExtensionArray(pa.array([1000, None], type=duration_type))
    tm.assert_extension_array_equal(parsed, wanted)
def test_unsupported_dt(data):
    """Accessing ``.dt`` on a non-temporal Arrow Series raises AttributeError."""
    arrow_type = data.dtype.pyarrow_dtype
    if pa.types.is_temporal(arrow_type):
        # Temporal types do support .dt; nothing is checked for them here.
        return

    expected_message = "Can only use .dt accessor with datetimelike values"
    with pytest.raises(AttributeError, match=expected_message):
        pd.Series(data).dt
@pytest.mark.parametrize(
    "prop, expected",
    [
        ["year", 2023],
        ["day", 2],
        ["day_of_week", 0],
        ["dayofweek", 0],
        ["weekday", 0],
        ["day_of_year", 2],
        ["dayofyear", 2],
        ["hour", 3],
        ["minute", 4],
        ["is_leap_year", False],
        ["microsecond", 5],
        ["month", 1],
        ["nanosecond", 6],
        ["quarter", 1],
        ["second", 7],
        ["date", date(2023, 1, 2)],
        ["time", time(3, 4, 7, 5)],
    ],
)
def test_dt_properties(prop, expected):
    """Each ``.dt`` property of an Arrow timestamp Series matches the known value."""
    stamp = pd.Timestamp(
        year=2023,
        month=1,
        day=2,
        hour=3,
        minute=4,
        second=7,
        microsecond=5,
        nanosecond=6,
    )
    timestamps = pd.Series([stamp, None], dtype=ArrowDtype(pa.timestamp("ns")))
    actual = getattr(timestamps.dt, prop)

    # date/time values need an explicit Arrow type; everything else is inferred.
    if isinstance(expected, date):
        arrow_type = pa.date32()
    elif isinstance(expected, time):
        arrow_type = pa.time64("ns")
    else:
        arrow_type = None

    wanted = pd.Series(
        ArrowExtensionArray(pa.array([expected, None], type=arrow_type))
    )
    tm.assert_series_equal(actual, wanted)
def test_dt_is_month_start_end():
    """dt.is_month_start / dt.is_month_end on an Arrow timestamp Series."""
    bool_dtype = ArrowDtype(pa.bool_())
    timestamps = pd.Series(
        [
            datetime(year=2023, month=12, day=2, hour=3),
            datetime(year=2023, month=1, day=1, hour=3),
            datetime(year=2023, month=3, day=31, hour=3),
            None,
        ],
        dtype=ArrowDtype(pa.timestamp("us")),
    )

    at_start = timestamps.dt.is_month_start
    wanted = pd.Series([False, True, False, None], dtype=bool_dtype)
    tm.assert_series_equal(at_start, wanted)

    at_end = timestamps.dt.is_month_end
    wanted = pd.Series([False, False, True, None], dtype=bool_dtype)
    tm.assert_series_equal(at_end, wanted)
def test_dt_is_year_start_end():
    """dt.is_year_start / dt.is_year_end on an Arrow timestamp Series."""
    bool_dtype = ArrowDtype(pa.bool_())
    timestamps = pd.Series(
        [
            datetime(year=2023, month=12, day=31, hour=3),
            datetime(year=2023, month=1, day=1, hour=3),
            datetime(year=2023, month=3, day=31, hour=3),
            None,
        ],
        dtype=ArrowDtype(pa.timestamp("us")),
    )

    at_start = timestamps.dt.is_year_start
    wanted = pd.Series([False, True, False, None], dtype=bool_dtype)
    tm.assert_series_equal(at_start, wanted)

    at_end = timestamps.dt.is_year_end
    wanted = pd.Series([True, False, False, None], dtype=bool_dtype)
    tm.assert_series_equal(at_end, wanted)
def test_dt_is_quarter_start_end():
    """dt.is_quarter_start / dt.is_quarter_end on an Arrow timestamp Series."""
    bool_dtype = ArrowDtype(pa.bool_())
    timestamps = pd.Series(
        [
            datetime(year=2023, month=11, day=30, hour=3),
            datetime(year=2023, month=1, day=1, hour=3),
            datetime(year=2023, month=3, day=31, hour=3),
            None,
        ],
        dtype=ArrowDtype(pa.timestamp("us")),
    )

    at_start = timestamps.dt.is_quarter_start
    wanted = pd.Series([False, True, False, None], dtype=bool_dtype)
    tm.assert_series_equal(at_start, wanted)

    at_end = timestamps.dt.is_quarter_end
    wanted = pd.Series([False, False, True, None], dtype=bool_dtype)
    tm.assert_series_equal(at_end, wanted)
@pytest.mark.parametrize("method", ["days_in_month", "daysinmonth"])
def test_dt_days_in_month(method):
    """Both spellings of the days-in-month property give int64 Arrow results."""
    timestamps = pd.Series(
        [
            datetime(year=2023, month=3, day=30, hour=3),
            datetime(year=2023, month=4, day=1, hour=3),
            datetime(year=2023, month=2, day=3, hour=3),
            None,
        ],
        dtype=ArrowDtype(pa.timestamp("us")),
    )
    day_counts = getattr(timestamps.dt, method)
    wanted = pd.Series([31, 30, 28, None], dtype=ArrowDtype(pa.int64()))
    tm.assert_series_equal(day_counts, wanted)
def test_dt_normalize():
    """dt.normalize resets the time-of-day to midnight and keeps the dtype."""
    micro_dtype = ArrowDtype(pa.timestamp("us"))
    timestamps = pd.Series(
        [
            datetime(year=2023, month=3, day=30),
            datetime(year=2023, month=4, day=1, hour=3),
            datetime(year=2023, month=2, day=3, hour=23, minute=59, second=59),
            None,
        ],
        dtype=micro_dtype,
    )
    normalized = timestamps.dt.normalize()

    midnights = [
        datetime(year=2023, month=3, day=30),
        datetime(year=2023, month=4, day=1),
        datetime(year=2023, month=2, day=3),
        None,
    ]
    wanted = pd.Series(midnights, dtype=micro_dtype)
    tm.assert_series_equal(normalized, wanted)
@pytest.mark.parametrize("unit", ["us", "ns"])
def test_dt_time_preserve_unit(unit):
    """dt.time yields a time64 Series in the same unit as the source timestamps."""
    timestamps = pd.Series(
        [datetime(year=2023, month=1, day=2, hour=3), None],
        dtype=ArrowDtype(pa.timestamp(unit)),
    )
    assert timestamps.dt.unit == unit

    times = timestamps.dt.time
    wanted_values = pa.array([time(3, 0), None], type=pa.time64(unit))
    wanted = pd.Series(ArrowExtensionArray(wanted_values))
    tm.assert_series_equal(times, wanted)
@pytest.mark.parametrize("tz", [None, "UTC", "US/Pacific"])
def test_dt_tz(tz):
    """dt.tz equals what timezones.maybe_get_tz returns for the dtype's tz."""
    timestamps = pd.Series(
        [datetime(year=2023, month=1, day=2, hour=3), None],
        dtype=ArrowDtype(pa.timestamp("ns", tz=tz)),
    )
    reported_tz = timestamps.dt.tz
    assert reported_tz == timezones.maybe_get_tz(tz)
def test_dt_isocalendar():
    """dt.isocalendar returns an int64[pyarrow] frame of year, week and day."""
    timestamps = pd.Series(
        [datetime(year=2023, month=1, day=2, hour=3), None],
        dtype=ArrowDtype(pa.timestamp("ns")),
    )
    calendar = timestamps.dt.isocalendar()

    # The null timestamp is expected as a row of zeros.
    wanted = pd.DataFrame(
        [[2023, 1, 1], [0, 0, 0]],
        columns=["year", "week", "day"],
        dtype="int64[pyarrow]",
    )
    tm.assert_frame_equal(calendar, wanted)
@pytest.mark.parametrize(
    "method, exp", [["day_name", "Sunday"], ["month_name", "January"]]
)
def test_dt_day_month_name(method, exp, request):
    """dt.day_name / dt.month_name return Arrow strings for an Arrow timestamp."""
    # GH 52388
    _require_timezone_database(request)
    timestamps = pd.Series(
        [datetime(2023, 1, 1), None], dtype=ArrowDtype(pa.timestamp("ms"))
    )
    name_method = getattr(timestamps.dt, method)
    names = name_method()
    wanted = pd.Series([exp, None], dtype=ArrowDtype(pa.string()))
    tm.assert_series_equal(names, wanted)
def test_dt_strftime(request):
    """dt.strftime on a nanosecond Arrow timestamp Series returns Arrow strings."""
    _require_timezone_database(request)
    timestamps = pd.Series(
        [datetime(year=2023, month=1, day=2, hour=3), None],
        dtype=ArrowDtype(pa.timestamp("ns")),
    )
    formatted = timestamps.dt.strftime("%Y-%m-%dT%H:%M:%S")

    # The expected seconds field carries nine fractional digits.
    wanted = pd.Series(
        ["2023-01-02T03:00:00.000000000", None], dtype=ArrowDtype(pa.string())
    )
    tm.assert_series_equal(formatted, wanted)
@pytest.mark.parametrize("method", ["ceil", "floor", "round"])
def test_dt_roundlike_tz_options_not_supported(method):
    """ceil / floor / round reject ``ambiguous`` and ``nonexistent`` options."""
    timestamps = pd.Series(
        [datetime(year=2023, month=1, day=2, hour=3), None],
        dtype=ArrowDtype(pa.timestamp("ns")),
    )

    with pytest.raises(NotImplementedError, match="ambiguous is not supported."):
        getattr(timestamps.dt, method)("1h", ambiguous="NaT")

    with pytest.raises(NotImplementedError, match="nonexistent is not supported."):
        getattr(timestamps.dt, method)("1h", nonexistent="NaT")
@pytest.mark.parametrize("method", ["ceil", "floor", "round"])
def test_dt_roundlike_unsupported_freq(method):
    """ceil / floor / round raise ValueError for a business-day or missing freq."""
    timestamps = pd.Series(
        [datetime(year=2023, month=1, day=2, hour=3), None],
        dtype=ArrowDtype(pa.timestamp("ns")),
    )

    with pytest.raises(ValueError, match="freq='1B' is not supported"):
        getattr(timestamps.dt, method)("1B")

    with pytest.raises(ValueError, match="Must specify a valid frequency: None"):
        getattr(timestamps.dt, method)(None)
@pytest.mark.parametrize("freq", ["D", "h", "min", "s", "ms", "us", "ns"])
@pytest.mark.parametrize("method", ["ceil", "floor", "round"])
def test_dt_ceil_year_floor(freq, method):
    """ceil / floor / round on Arrow timestamps agree with the default-dtype result."""
    default_series = pd.Series(
        [datetime(year=2023, month=1, day=1), None],
    )
    arrow_dtype = ArrowDtype(pa.timestamp("ns"))
    rule = f"1{freq}"

    # Reference: round with the default dtype, then cast to Arrow.
    wanted = getattr(default_series.dt, method)(rule).astype(arrow_dtype)
    # Under test: cast to Arrow first, then round.
    actual = getattr(default_series.astype(arrow_dtype).dt, method)(rule)
    tm.assert_series_equal(actual, wanted)
def test_dt_to_pydatetime():
    """dt.to_pydatetime warns and returns an object ndarray of datetime objects."""
    # GH 51859
    moments = [datetime(2022, 1, 1), datetime(2023, 1, 1)]
    timestamps = pd.Series(moments, dtype=ArrowDtype(pa.timestamp("ns")))

    arrow_warning = (
        "The behavior of ArrowTemporalProperties.to_pydatetime is deprecated"
    )
    with tm.assert_produces_warning(FutureWarning, match=arrow_warning):
        converted = timestamps.dt.to_pydatetime()
    wanted = np.array(moments, dtype=object)
    tm.assert_numpy_array_equal(converted, wanted)
    assert all(type(item) is datetime for item in converted)

    # The datetime64[ns] accessor must agree (and warns under its own name).
    numpy_warning = "The behavior of DatetimeProperties.to_pydatetime is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=numpy_warning):
        wanted = timestamps.astype("datetime64[ns]").dt.to_pydatetime()
    tm.assert_numpy_array_equal(converted, wanted)
@pytest.mark.parametrize("date_type", [32, 64])
def test_dt_to_pydatetime_date_error(date_type):
    """dt.to_pydatetime on Arrow date32/date64 data warns and raises ValueError."""
    # GH 52812
    arrow_date_factory = getattr(pa, f"date{date_type}")
    dates = pd.Series(
        [date(2022, 12, 31)],
        dtype=ArrowDtype(arrow_date_factory()),
    )
    warning_message = (
        "The behavior of ArrowTemporalProperties.to_pydatetime is deprecated"
    )
    with tm.assert_produces_warning(FutureWarning, match=warning_message):
        with pytest.raises(ValueError, match="to_pydatetime cannot be called with"):
            dates.dt.to_pydatetime()
def test_dt_tz_localize_unsupported_tz_options():
    """dt.tz_localize rejects ``ambiguous="NaT"`` and ``nonexistent="NaT"``."""
    timestamps = pd.Series(
        [datetime(year=2023, month=1, day=2, hour=3), None],
        dtype=ArrowDtype(pa.timestamp("ns")),
    )

    ambiguous_message = "ambiguous='NaT' is not supported"
    with pytest.raises(NotImplementedError, match=ambiguous_message):
        timestamps.dt.tz_localize("UTC", ambiguous="NaT")

    nonexistent_message = "nonexistent='NaT' is not supported"
    with pytest.raises(NotImplementedError, match=nonexistent_message):
        timestamps.dt.tz_localize("UTC", nonexistent="NaT")
def test_dt_tz_localize_none():
    """dt.tz_localize(None) on tz-aware Arrow timestamps gives a tz-naive Series."""
    raw_values = [datetime(year=2023, month=1, day=2, hour=3), None]
    aware = pd.Series(
        raw_values, dtype=ArrowDtype(pa.timestamp("ns", tz="US/Pacific"))
    )
    localized = aware.dt.tz_localize(None)
    wanted = pd.Series(
        [datetime(year=2023, month=1, day=2, hour=3), None],
        dtype=ArrowDtype(pa.timestamp("ns")),
    )
    tm.assert_series_equal(localized, wanted)
@pytest.mark.parametrize("unit", ["us", "ns"])
def test_dt_tz_localize(unit, request):
    """dt.tz_localize matches pyarrow's assume_timezone for the same zone."""
    _require_timezone_database(request)
    naive = pd.Series(
        [datetime(year=2023, month=1, day=2, hour=3), None],
        dtype=ArrowDtype(pa.timestamp(unit)),
    )
    localized = naive.dt.tz_localize("US/Pacific")

    # Build the reference directly with pyarrow.
    naive_arrow = pa.array(
        [datetime(year=2023, month=1, day=2, hour=3), None], type=pa.timestamp(unit)
    )
    aware_arrow = pa.compute.assume_timezone(naive_arrow, "US/Pacific")
    wanted = pd.Series(ArrowExtensionArray(aware_arrow))
    tm.assert_series_equal(localized, wanted)
@pytest.mark.parametrize(
    "nonexistent, exp_date",
    [
        ["shift_forward", datetime(year=2023, month=3, day=12, hour=3)],
        ["shift_backward", pd.Timestamp("2023-03-12 01:59:59.999999999")],
    ],
)
def test_dt_tz_localize_nonexistent(nonexistent, exp_date, request):
    """dt.tz_localize shifts a nonexistent local time per the ``nonexistent`` option."""
    _require_timezone_database(request)
    naive = pd.Series(
        [datetime(year=2023, month=3, day=12, hour=2, minute=30), None],
        dtype=ArrowDtype(pa.timestamp("ns")),
    )
    localized = naive.dt.tz_localize("US/Pacific", nonexistent=nonexistent)

    # Reference: the already-shifted wall time, localized by pyarrow.
    shifted_arrow = pa.array([exp_date, None], type=pa.timestamp("ns"))
    aware_arrow = pa.compute.assume_timezone(shifted_arrow, "US/Pacific")
    wanted = pd.Series(ArrowExtensionArray(aware_arrow))
    tm.assert_series_equal(localized, wanted)
def test_dt_tz_convert_not_tz_raises():
    """dt.tz_convert on tz-naive Arrow timestamps raises TypeError."""
    naive = pd.Series(
        [datetime(year=2023, month=1, day=2, hour=3), None],
        dtype=ArrowDtype(pa.timestamp("ns")),
    )
    expected_message = "Cannot convert tz-naive timestamps"
    with pytest.raises(TypeError, match=expected_message):
        naive.dt.tz_convert("UTC")
def test_dt_tz_convert_none():
    """dt.tz_convert(None) on tz-aware Arrow timestamps gives a tz-naive Series."""
    aware = pd.Series(
        [datetime(year=2023, month=1, day=2, hour=3), None],
        dtype=ArrowDtype(pa.timestamp("ns", "US/Pacific")),
    )
    converted = aware.dt.tz_convert(None)
    wanted = pd.Series(
        [datetime(year=2023, month=1, day=2, hour=3), None],
        dtype=ArrowDtype(pa.timestamp("ns")),
    )
    tm.assert_series_equal(converted, wanted)
@pytest.mark.parametrize("unit", ["us", "ns"])
def test_dt_tz_convert(unit):
    """dt.tz_convert moves Arrow timestamps from US/Pacific to US/Eastern."""
    pacific = pd.Series(
        [datetime(year=2023, month=1, day=2, hour=3), None],
        dtype=ArrowDtype(pa.timestamp(unit, "US/Pacific")),
    )
    converted = pacific.dt.tz_convert("US/Eastern")
    wanted = pd.Series(
        [datetime(year=2023, month=1, day=2, hour=3), None],
        dtype=ArrowDtype(pa.timestamp(unit, "US/Eastern")),
    )
    tm.assert_series_equal(converted, wanted)
@pytest.mark.parametrize("dtype", ["timestamp[ms][pyarrow]", "duration[ms][pyarrow]"])
def test_as_unit(dtype):
    """``dt.as_unit("ns")`` matches ``astype`` to the same dtype with ns unit."""
    # GH 52284
    ms_ser = pd.Series([1000, None], dtype=dtype)
    result = ms_ser.dt.as_unit("ns")
    ns_dtype = dtype.replace("ms", "ns")
    tm.assert_series_equal(result, ms_ser.astype(ns_dtype))
@pytest.mark.parametrize(
    "prop, expected",
    [
        ["days", 1],
        ["seconds", 2],
        ["microseconds", 3],
        ["nanoseconds", 4],
    ],
)
def test_dt_timedelta_properties(prop, expected):
    """Each ``dt`` component accessor yields an Arrow int32 Series, null preserved."""
    # GH 52284
    delta = pd.Timedelta(days=1, seconds=2, microseconds=3, nanoseconds=4)
    ser = pd.Series([delta, None], dtype=ArrowDtype(pa.duration("ns")))
    result = getattr(ser.dt, prop)

    expected_values = pa.array([expected, None], type=pa.int32())
    expected_ser = pd.Series(ArrowExtensionArray(expected_values))
    tm.assert_series_equal(result, expected_ser)
def test_dt_timedelta_total_seconds():
    """``dt.total_seconds`` returns an Arrow float64 Series with the null kept."""
    # GH 52284
    delta = pd.Timedelta(days=1, seconds=2, microseconds=3, nanoseconds=4)
    ser = pd.Series([delta, None], dtype=ArrowDtype(pa.duration("ns")))
    result = ser.dt.total_seconds()

    expected_values = pa.array([86402.000003, None], type=pa.float64())
    expected = pd.Series(ArrowExtensionArray(expected_values))
    tm.assert_series_equal(result, expected)
def test_dt_to_pytimedelta():
    """``dt.to_pytimedelta`` gives an object ndarray of stdlib ``timedelta`` values.

    The result is also compared with the same call on the
    ``timedelta64[ns]`` version of the Series.
    """
    # GH 52284
    deltas = [timedelta(1, 2, 3), timedelta(1, 2, 4)]
    ser = pd.Series(deltas, dtype=ArrowDtype(pa.duration("ns")))
    result = ser.dt.to_pytimedelta()

    tm.assert_numpy_array_equal(result, np.array(deltas, dtype=object))
    assert all(type(res) is timedelta for res in result)

    numpy_backed = ser.astype("timedelta64[ns]").dt.to_pytimedelta()
    tm.assert_numpy_array_equal(result, numpy_backed)
def test_dt_components():
    """``dt.components`` returns an int32[pyarrow] frame, one column per component."""
    # GH 52284
    delta = pd.Timedelta(days=1, seconds=2, microseconds=3, nanoseconds=4)
    ser = pd.Series([delta, None], dtype=ArrowDtype(pa.duration("ns")))
    result = ser.dt.components

    component_names = [
        "days",
        "hours",
        "minutes",
        "seconds",
        "milliseconds",
        "microseconds",
        "nanoseconds",
    ]
    # The null input row is expected to be null in every component column.
    expected = pd.DataFrame(
        [[1, 0, 0, 2, 0, 3, 4], [None] * len(component_names)],
        columns=component_names,
        dtype="int32[pyarrow]",
    )
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("skipna", [True, False])
def test_boolean_reduce_series_all_null(all_boolean_reductions, skipna):
    """Boolean reductions over an all-null Arrow float Series.

    With ``skipna=True`` the result must be ``True`` for "all" and ``False``
    otherwise; with ``skipna=False`` it must be ``pd.NA``.
    """
    # GH51624
    ser = pd.Series([None], dtype="float64[pyarrow]")
    reduction = getattr(ser, all_boolean_reductions)
    result = reduction(skipna=skipna)
    expected = (all_boolean_reductions == "all") if skipna else pd.NA
    assert result is expected
def test_from_sequence_of_strings_boolean():
    """``_from_sequence_of_strings`` parses boolean spellings and rejects others."""
    truthy = ["true", "TRUE", "True", "1", "1.0"]
    falsy = ["false", "FALSE", "False", "0", "0.0"]
    missing = [None]

    strings = [*truthy, *falsy, *missing]
    bools = [True] * len(truthy) + [False] * len(falsy) + [None] * len(missing)

    result = ArrowExtensionArray._from_sequence_of_strings(strings, dtype=pa.bool_())
    expected = pd.array(bools, dtype="boolean[pyarrow]")
    tm.assert_extension_array_equal(result, expected)

    # A string that is not a recognized boolean spelling must raise.
    invalid = ["True", "foo"]
    with pytest.raises(pa.ArrowInvalid, match="Failed to parse"):
        ArrowExtensionArray._from_sequence_of_strings(invalid, dtype=pa.bool_())
def test_concat_empty_arrow_backed_series(dtype):
    """Concatenating a mask-indexed empty Series returns an equal empty Series."""
    # GH#51734
    empty = pd.Series([], dtype=dtype)
    expected = empty.copy()
    no_rows = np.array([], dtype=np.bool_)
    result = pd.concat([empty[no_rows]])
    tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("dtype", ["string", "string[pyarrow]"])
def test_series_from_string_array(dtype):
    """A Series built from a pyarrow string array equals one built from its wrapper."""
    words = pa.array("the quick brown fox".split())
    from_arrow = pd.Series(words, dtype=dtype)
    from_wrapper = pd.Series(ArrowExtensionArray(words), dtype=dtype)
    tm.assert_series_equal(from_arrow, from_wrapper)
class OldArrowExtensionArray(ArrowExtensionArray):
    """ArrowExtensionArray whose pickled state uses the legacy ``_data`` key.

    ``__getstate__`` moves the value stored under ``_pa_array`` to ``_data``,
    so the pickled state has the layout from before the attribute was renamed.
    """

    def __getstate__(self):
        legacy_state = super().__getstate__()
        pa_array = legacy_state.pop("_pa_array")
        legacy_state["_data"] = pa_array
        return legacy_state
def test_pickle_old_arrowextensionarray():
    """Unpickling legacy state restores ``_pa_array`` and leaves no ``_data``."""
    data = pa.array([1])
    expected = OldArrowExtensionArray(data)
    payload = pickle.dumps(expected)
    result = pickle.loads(payload)

    tm.assert_extension_array_equal(result, expected)
    assert result._pa_array == pa.chunked_array(data)
    assert not hasattr(result, "_data")
def test_setitem_boolean_replace_with_mask_segfault():
    """Assigning through an all-False mask leaves a 145,000-element array unchanged."""
    # GH#52059
    size = 145_000
    all_true = np.ones((size,), dtype=np.bool_)
    arr = ArrowExtensionArray(pa.chunked_array([all_true]))
    expected = arr.copy()

    nothing_selected = np.zeros((size,), dtype=np.bool_)
    arr[nothing_selected] = False
    assert arr._pa_array == expected._pa_array
@pytest.mark.parametrize(
    "data, arrow_dtype",
    [
        ([b"a", b"b"], pa.large_binary()),
        (["a", "b"], pa.large_string()),
    ],
)
def test_conversion_large_dtypes_from_numpy_array(data, arrow_dtype):
    """``pd.array`` with a large Arrow dtype agrees for ndarray and list input."""
    large_dtype = ArrowDtype(arrow_dtype)
    from_numpy = pd.array(np.array(data), dtype=large_dtype)
    from_list = pd.array(data, dtype=large_dtype)
    tm.assert_extension_array_equal(from_numpy, from_list)
def test_concat_null_array():
    """Concatenating a null-typed frame with an int64 frame yields int64[pyarrow]."""
    null_frame = pd.DataFrame({"a": [None, None]}, dtype=ArrowDtype(pa.null()))
    int_frame = pd.DataFrame({"a": [0, 1]}, dtype="int64[pyarrow]")

    result = pd.concat([null_frame, int_frame], ignore_index=True)
    expected = pd.DataFrame({"a": [None, None, 0, 1]}, dtype="int64[pyarrow]")
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("pa_type", tm.ALL_INT_PYARROW_DTYPES + tm.FLOAT_PYARROW_DTYPES)
def test_describe_numeric_data(pa_type):
    """``describe`` on Arrow numeric data returns an Arrow float64 summary."""
    # GH 52470
    ser = pd.Series([1, 2, 3], dtype=ArrowDtype(pa_type))
    result = ser.describe()

    stat_names = ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]
    stat_values = [3, 2, 1, 1, 1.5, 2.0, 2.5, 3]
    expected = pd.Series(
        stat_values, dtype=ArrowDtype(pa.float64()), index=stat_names
    )
    tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("pa_type", tm.TIMEDELTA_PYARROW_DTYPES)
def test_describe_timedelta_data(pa_type):
    """``describe`` on Arrow durations returns an object Series of Timedelta stats."""
    # GH53001
    ser = pd.Series(range(1, 10), dtype=ArrowDtype(pa_type))
    result = ser.describe()

    stats = pd.to_timedelta([5, 2, 1, 3, 5, 7, 9], unit=pa_type.unit).tolist()
    expected = pd.Series(
        [9] + stats,
        dtype=object,
        index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
    )
    tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("pa_type", tm.DATETIME_PYARROW_DTYPES)
def test_describe_datetime_data(pa_type):
    """``describe`` on Arrow timestamps returns an object Series of Timestamp stats."""
    # GH53001
    ser = pd.Series(range(1, 10), dtype=ArrowDtype(pa_type))
    result = ser.describe()

    stats = [
        pd.Timestamp(v, tz=pa_type.tz, unit=pa_type.unit)
        for v in [5, 1, 3, 5, 7, 9]
    ]
    expected = pd.Series(
        [9] + stats,
        dtype=object,
        index=["count", "mean", "min", "25%", "50%", "75%", "max"],
    )
    tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
    "pa_type", tm.DATETIME_PYARROW_DTYPES + tm.TIMEDELTA_PYARROW_DTYPES
)
def test_quantile_temporal(pa_type):
    """The 0.1 quantile of ``[1, 2, 3]`` temporal data equals the first element."""
    # GH52678
    ser = pd.Series([1, 2, 3], dtype=ArrowDtype(pa_type))
    result = ser.quantile(0.1)
    assert result == ser[0]
def test_date32_repr():
    """The repr of a date32 Arrow Series shows the date and the Arrow dtype name."""
    # GH48238
    arrow_dt = pa.array([date.fromisoformat("2020-01-01")], type=pa.date32())
    ser = pd.Series(arrow_dt, dtype=ArrowDtype(arrow_dt.type))
    # The Series repr separates the index and value columns with four spaces;
    # a single space here can never match.
    assert repr(ser) == "0    2020-01-01\ndtype: date32[day][pyarrow]"
def test_duration_overflow_from_ndarray_containing_nat():
    """Adding Arrow timestamp and duration Series built from NaT-bearing data works."""
    # GH52843
    timestamps = pd.Series(
        pd.to_datetime([1, None]), dtype=ArrowDtype(pa.timestamp("ns"))
    )
    durations = pd.Series(
        pd.to_timedelta([1, None]), dtype=ArrowDtype(pa.duration("ns"))
    )
    result = timestamps + durations
    expected = pd.Series([2, None], dtype=ArrowDtype(pa.timestamp("ns")))
    tm.assert_series_equal(result, expected)
def test_infer_dtype_pyarrow_dtype(data, request):
    """``lib.infer_dtype`` on the array agrees with inference on ``list(data)``."""
    inferred = lib.infer_dtype(data)
    assert inferred != "unknown-array"

    na_sensitive_kinds = ["floating", "datetime64", "timedelta64"]
    if data._hasna and inferred in na_sensitive_kinds:
        request.applymarker(
            pytest.mark.xfail(
                reason="in infer_dtype pd.NA is not ignored in these cases "
                "even with skipna=True in the list(data) check below"
            )
        )

    assert inferred == lib.infer_dtype(list(data), skipna=True)
@pytest.mark.parametrize(
    "pa_type", tm.DATETIME_PYARROW_DTYPES + tm.TIMEDELTA_PYARROW_DTYPES
)
def test_from_sequence_temporal(pa_type):
    """``_from_sequence`` accepts a Timedelta/Timestamp scalar in the type's unit."""
    # GH 53171
    val = 3
    unit = pa_type.unit
    if pa.types.is_duration(pa_type):
        scalar = pd.Timedelta(val, unit=unit).as_unit(unit)
    else:
        scalar = pd.Timestamp(val, unit=unit, tz=pa_type.tz).as_unit(unit)

    result = ArrowExtensionArray._from_sequence([scalar], dtype=pa_type)
    expected = ArrowExtensionArray(pa.array([val], type=pa_type))
    tm.assert_extension_array_equal(result, expected)
@pytest.mark.parametrize(
    "pa_type", tm.DATETIME_PYARROW_DTYPES + tm.TIMEDELTA_PYARROW_DTYPES
)
def test_setitem_temporal(pa_type):
    """Slice-assigning a Timedelta/Timestamp scalar fills every element."""
    # GH 53171
    unit = pa_type.unit
    if pa.types.is_duration(pa_type):
        fill = pd.Timedelta(1, unit=unit).as_unit(unit)
    else:
        fill = pd.Timestamp(1, unit=unit, tz=pa_type.tz).as_unit(unit)

    original = ArrowExtensionArray(pa.array([1, 2, 3], type=pa_type))
    result = original.copy()
    result[:] = fill

    expected = ArrowExtensionArray(pa.array([1, 1, 1], type=pa_type))
    tm.assert_extension_array_equal(result, expected)
@pytest.mark.parametrize(
    "pa_type", tm.DATETIME_PYARROW_DTYPES + tm.TIMEDELTA_PYARROW_DTYPES
)
def test_arithmetic_temporal(pa_type, request):
    """Subtracting a one-unit Timedelta shifts each element down by one."""
    # GH 53171
    unit = pa_type.unit
    one_unit = pd.Timedelta(1, unit=unit).as_unit(unit)
    arr = ArrowExtensionArray(pa.array([1, 2, 3], type=pa_type))

    result = arr - one_unit
    expected = ArrowExtensionArray(pa.array([0, 1, 2], type=pa_type))
    tm.assert_extension_array_equal(result, expected)
@pytest.mark.parametrize(
    "pa_type", tm.DATETIME_PYARROW_DTYPES + tm.TIMEDELTA_PYARROW_DTYPES
)
def test_comparison_temporal(pa_type):
    """``>`` against a Timedelta/Timestamp scalar returns an Arrow boolean array."""
    # GH 53171
    unit = pa_type.unit
    if pa.types.is_duration(pa_type):
        threshold = pd.Timedelta(1, unit=unit).as_unit(unit)
    else:
        threshold = pd.Timestamp(1, unit=unit, tz=pa_type.tz).as_unit(unit)

    arr = ArrowExtensionArray(pa.array([1, 2, 3], type=pa_type))
    result = arr > threshold

    expected_mask = pa.array([False, True, True], type=pa.bool_())
    tm.assert_extension_array_equal(result, ArrowExtensionArray(expected_mask))
@pytest.mark.parametrize(
    "pa_type", tm.DATETIME_PYARROW_DTYPES + tm.TIMEDELTA_PYARROW_DTYPES
)
def test_getitem_temporal(pa_type):
    """Scalar indexing returns a pandas Timedelta/Timestamp in the Arrow unit."""
    # GH 53326
    unit = pa_type.unit
    arr = ArrowExtensionArray(pa.array([1, 2, 3], type=pa_type))
    result = arr[1]

    if pa.types.is_duration(pa_type):
        scalar_cls = pd.Timedelta
        expected = pd.Timedelta(2, unit=unit).as_unit(unit)
    else:
        scalar_cls = pd.Timestamp
        expected = pd.Timestamp(2, unit=unit, tz=pa_type.tz).as_unit(unit)

    assert isinstance(result, scalar_cls)
    assert result.unit == expected.unit
    assert result == expected
@pytest.mark.parametrize(
    "pa_type", tm.DATETIME_PYARROW_DTYPES + tm.TIMEDELTA_PYARROW_DTYPES
)
def test_iter_temporal(pa_type):
    """Iteration yields pandas temporal scalars in the Arrow unit and ``pd.NA``."""
    # GH 53326
    unit = pa_type.unit
    arr = ArrowExtensionArray(pa.array([1, None], type=pa_type))
    result = list(arr)

    if pa.types.is_duration(pa_type):
        scalar_cls = pd.Timedelta
        first = pd.Timedelta(1, unit=unit).as_unit(unit)
    else:
        scalar_cls = pd.Timestamp
        first = pd.Timestamp(1, unit=unit, tz=pa_type.tz).as_unit(unit)
    expected = [first, pd.NA]

    assert isinstance(result[0], scalar_cls)
    assert result[0].unit == expected[0].unit
    assert result == expected
def test_groupby_series_size_returns_pa_int(data):
    """``groupby(...).size()`` on an Arrow-backed Series returns int64[pyarrow]."""
    # GH 54132
    labels = ["a", "a", "b"]
    ser = pd.Series(data[:3], index=labels)
    result = ser.groupby(level=0).size()
    expected = pd.Series([2, 1], dtype="int64[pyarrow]", index=["a", "b"])
    tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
    "pa_type", tm.DATETIME_PYARROW_DTYPES + tm.TIMEDELTA_PYARROW_DTYPES, ids=repr
)
@pytest.mark.parametrize("dtype", [None, object])
def test_to_numpy_temporal(pa_type, dtype):
    """``to_numpy`` on temporal Arrow data, with and without ``dtype=object``.

    Object output is expected for ``dtype=object`` and for tz-aware
    timestamps; otherwise a NumPy temporal array with NaT is expected.
    """
    # GH 53326
    # GH 55997: Return datetime64/timedelta64 types with NaT if possible
    unit = pa_type.unit
    arr = ArrowExtensionArray(pa.array([1, None], type=pa_type))
    result = arr.to_numpy(dtype=dtype)

    if pa.types.is_duration(pa_type):
        value = pd.Timedelta(1, unit=unit).as_unit(unit)
    else:
        value = pd.Timestamp(1, unit=unit, tz=pa_type.tz).as_unit(unit)

    tz_aware = pa.types.is_timestamp(pa_type) and pa_type.tz is not None
    if dtype == object or tz_aware:
        # Explicit object dtype keeps pd.NA; the tz-aware fallback uses NaT.
        na = pd.NA if dtype == object else pd.NaT
        expected = np.array([value, na], dtype=object)
        assert result[0].unit == value.unit
    else:
        na = pa_type.to_pandas_dtype().type("nat", unit)
        expected = np.array([value.to_numpy(), na])
        assert np.datetime_data(result[0])[0] == unit
    tm.assert_numpy_array_equal(result, expected)
def test_groupby_count_return_arrow_dtype(data_missing):
    """``groupby(...).count()`` over Arrow columns returns int64[pyarrow] counts."""
    frame = pd.DataFrame({"A": [1, 1], "B": data_missing, "C": data_missing})
    result = frame.groupby("A").count()

    group_index = pd.Index([1], name="A")
    expected = pd.DataFrame(
        [[1, 1]], index=group_index, columns=["B", "C"], dtype="int64[pyarrow]"
    )
    tm.assert_frame_equal(result, expected)
def test_fixed_size_list():
    """The dtype of a fixed-size-list Arrow Series reports ``list`` as its type."""
    # GH#55000
    fixed_list_type = pa.list_(pa.int64(), list_size=2)
    ser = pd.Series([[1, 2], [3, 4]], dtype=ArrowDtype(fixed_list_type))
    scalar_type = ser.dtype.type
    assert scalar_type == list
def test_arrowextensiondtype_dataframe_repr():
    """The repr of a DataFrame backed by an Arrow extension (period) type."""
    # GH 54062
    df = pd.DataFrame(
        pd.period_range("2012", periods=3),
        columns=["col"],
        dtype=ArrowDtype(ArrowPeriodType("D")),
    )
    result = repr(df)
    # TODO: repr value may not be expected; address how
    # pyarrow.ExtensionType values are displayed
    # The DataFrame repr right-aligns the header and puts two spaces between
    # the index and the values, so the padding below is significant.
    expected = "     col\n0  15340\n1  15341\n2  15342"
    assert result == expected
def test_pow_missing_operand():
    """``Series.pow(None, fill_value=3)`` cubes values and keeps the null."""
    # GH 55512
    base = pd.Series([2, None], dtype="int64[pyarrow]")
    result = base.pow(None, fill_value=3)
    tm.assert_series_equal(result, pd.Series([8, None], dtype="int64[pyarrow]"))
@pytest.mark.parametrize("pa_type", tm.TIMEDELTA_PYARROW_DTYPES)
def test_duration_fillna_numpy(pa_type):
    """``fillna`` on an Arrow duration Series accepts a NumPy timedelta64 Series."""
    # GH 54707
    arrow_dtype = ArrowDtype(pa_type)
    with_null = pd.Series([None, 2], dtype=arrow_dtype)
    filler = pd.Series(np.array([1, 3], dtype=f"m8[{pa_type.unit}]"))

    result = with_null.fillna(filler)
    expected = pd.Series([1, 2], dtype=arrow_dtype)
    tm.assert_series_equal(result, expected)
def test_comparison_not_propagating_arrow_error():
    """Comparing uint64 ``1 << 63`` with an int64 Series raises ``ArrowInvalid``."""
    # GH#54944
    unsigned = pd.Series([1 << 63], dtype="uint64[pyarrow]")
    signed_null = pd.Series([None], dtype="int64[pyarrow]")
    with pytest.raises(pa.lib.ArrowInvalid, match="Integer value"):
        unsigned < signed_null
def test_factorize_chunked_dictionary():
    """``factorize`` works on a chunked, dictionary-encoded Arrow array."""
    # GH 54844
    chunks = [
        pa.array(["a"]).dictionary_encode(),
        pa.array(["b"]).dictionary_encode(),
    ]
    pa_array = pa.chunked_array(chunks)
    ser = pd.Series(ArrowExtensionArray(pa_array))

    codes, uniques = ser.factorize()

    expected_codes = np.array([0, 1], dtype=np.intp)
    expected_uniques = pd.Index(ArrowExtensionArray(pa_array.combine_chunks()))
    tm.assert_numpy_array_equal(codes, expected_codes)
    tm.assert_index_equal(uniques, expected_uniques)
def test_dictionary_astype_categorical():
    """``astype("category")`` on chunked dictionary data merges the categories."""
    # GH#56672
    first_chunk = pa.array(np.array(["a", "x", "c", "a"])).dictionary_encode()
    second_chunk = pa.array(np.array(["a", "d", "c"])).dictionary_encode()
    chunked = pa.chunked_array([first_chunk, second_chunk])
    ser = pd.Series(ArrowExtensionArray(chunked))

    result = ser.astype("category")

    categories = pd.Index(["a", "x", "c", "d"], dtype=ArrowDtype(pa.string()))
    cat_dtype = pd.CategoricalDtype(categories=categories)
    expected = pd.Series(["a", "x", "c", "a", "a", "d", "c"], dtype=cat_dtype)
    tm.assert_series_equal(result, expected)
def test_arrow_floordiv():
    """Floor division of ``-7`` by ``4`` on int64[pyarrow] gives ``-2``."""
    # GH 55561
    numerator = pd.Series([-7], dtype="int64[pyarrow]")
    denominator = pd.Series([4], dtype="int64[pyarrow]")
    result = numerator // denominator
    tm.assert_series_equal(result, pd.Series([-2], dtype="int64[pyarrow]"))
def test_arrow_floordiv_large_values():
    """Floor division of a large int64 value by 1_000_000 gives the exact quotient."""
    # GH 56645
    large = pd.Series([1425801600000000000], dtype="int64[pyarrow]")
    result = large // 1_000_000
    expected = pd.Series([1425801600000], dtype="int64[pyarrow]")
    tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("dtype", ["int64[pyarrow]", "uint64[pyarrow]"])
def test_arrow_floordiv_large_integral_result(dtype):
    """Floor-dividing a large integer by 1 returns the Series unchanged."""
    # GH 56676
    large = pd.Series([18014398509481983], dtype=dtype)
    quotient = large // 1
    tm.assert_series_equal(quotient, large)
@pytest.mark.parametrize("pa_type", tm.SIGNED_INT_PYARROW_DTYPES)
def test_arrow_floordiv_larger_divisor(pa_type):
    """``-23 // 24`` gives ``-1`` for each signed Arrow integer type."""
    # GH 56676
    arrow_dtype = ArrowDtype(pa_type)
    negative = pd.Series([-23], dtype=arrow_dtype)
    result = negative // 24
    tm.assert_series_equal(result, pd.Series([-1], dtype=arrow_dtype))
@pytest.mark.parametrize("pa_type", tm.SIGNED_INT_PYARROW_DTYPES)
def test_arrow_floordiv_integral_invalid(pa_type):
    """Floor-dividing the type minimum by ``-1`` or by ``0`` raises ``ArrowInvalid``."""
    # GH 56676
    numpy_dtype = pa_type.to_pandas_dtype()
    min_value = np.iinfo(numpy_dtype).min
    ser = pd.Series([min_value], dtype=ArrowDtype(pa_type))

    with pytest.raises(pa.lib.ArrowInvalid, match="overflow|not in range"):
        ser // -1
    with pytest.raises(pa.lib.ArrowInvalid, match="divide by zero"):
        ser // 0
@pytest.mark.parametrize("dtype", tm.FLOAT_PYARROW_DTYPES_STR_REPR)
def test_arrow_floordiv_floating_0_divisor(dtype):
    """Floor-dividing an Arrow float by zero gives positive infinity."""
    # GH 56676
    two = pd.Series([2], dtype=dtype)
    result = two // 0
    tm.assert_series_equal(result, pd.Series([float("inf")], dtype=dtype))
@pytest.mark.parametrize("dtype", ["float64", "datetime64[ns]", "timedelta64[ns]"])
def test_astype_int_with_null_to_numpy_dtype(dtype):
    """Casting int64[pyarrow] with a null to a NumPy dtype matches direct creation."""
    # GH 57093
    values = [1, None]
    arrow_ser = pd.Series(values, dtype="int64[pyarrow]")
    result = arrow_ser.astype(dtype)
    tm.assert_series_equal(result, pd.Series(values, dtype=dtype))
@pytest.mark.parametrize("pa_type", tm.ALL_INT_PYARROW_DTYPES)
def test_arrow_integral_floordiv_large_values(pa_type):
    """Floor-dividing the type maximum by ``1`` returns the maximum unchanged."""
    # GH 56676
    arrow_dtype = ArrowDtype(pa_type)
    max_value = np.iinfo(pa_type.to_pandas_dtype()).max
    largest = pd.Series([max_value], dtype=arrow_dtype)
    one = pd.Series([1], dtype=arrow_dtype)
    tm.assert_series_equal(largest // one, largest)
@pytest.mark.parametrize("dtype", ["int64[pyarrow]", "uint64[pyarrow]"])
def test_arrow_true_division_large_divisor(dtype):
    """True division of ``0`` by a large integer gives ``0`` as float64[pyarrow]."""
    # GH 56706
    zero = pd.Series([0], dtype=dtype)
    large = pd.Series([18014398509481983], dtype=dtype)
    result = zero / large
    tm.assert_series_equal(result, pd.Series([0], dtype="float64[pyarrow]"))
@pytest.mark.parametrize("dtype", ["int64[pyarrow]", "uint64[pyarrow]"])
def test_arrow_floor_division_large_divisor(dtype):
    """Floor division of ``0`` by a large integer gives ``0`` in the same dtype."""
    # GH 56706
    zero = pd.Series([0], dtype=dtype)
    large = pd.Series([18014398509481983], dtype=dtype)
    result = zero // large
    tm.assert_series_equal(result, pd.Series([0], dtype=dtype))
def test_string_to_datetime_parsing_cast():
    """Datetime strings given with ``timestamp[ns][pyarrow]`` dtype are parsed."""
    # GH 56266
    string_dates = ["2020-01-01 04:30:00", "2020-01-02 00:00:00", "2020-01-03 00:00:00"]
    result = pd.Series(string_dates, dtype="timestamp[ns][pyarrow]")

    parsed = pa.array(pd.to_datetime(string_dates), from_pandas=True)
    expected = pd.Series(ArrowExtensionArray(parsed))
    tm.assert_series_equal(result, expected)
@pytest.mark.skipif(
    pa_version_under13p0, reason="pairwise_diff_checked not implemented in pyarrow"
)
def test_interpolate_not_numeric(data):
    """``interpolate`` on a non-numeric Arrow Series raises ``TypeError``.

    Numeric dtypes are not checked here; the test body is a no-op for them.
    """
    if data.dtype._is_numeric:
        return

    ser = pd.Series(data)
    msg = re.escape(f"Cannot interpolate with {ser.dtype} dtype")
    with pytest.raises(TypeError, match=msg):
        pd.Series(data).interpolate()
def test_string_to_time_parsing_cast():
    """A time string given with ``time64[us][pyarrow]`` dtype is parsed."""
    # GH 56463
    string_times = ["11:41:43.076160"]
    result = pd.Series(string_times, dtype="time64[us][pyarrow]")

    parsed = pa.array([time(11, 41, 43, 76160)], from_pandas=True)
    expected = pd.Series(ArrowExtensionArray(parsed))
    tm.assert_series_equal(result, expected)
def test_to_numpy_float():
    """Casting ``float[pyarrow]`` with a null to float64 turns the null into NaN."""
    # GH#56267
    arrow_ser = pd.Series([32, 40, None], dtype="float[pyarrow]")
    result = arrow_ser.astype("float64")
    tm.assert_series_equal(result, pd.Series([32, 40, np.nan], dtype="float64"))
def test_to_numpy_timestamp_to_int():
    """``to_numpy(dtype=np.int64)`` on an ns timestamp returns the integer value."""
    # GH 55997
    timestamps = pd.Series(["2020-01-01 04:30:00"], dtype="timestamp[ns][pyarrow]")
    result = timestamps.to_numpy(dtype=np.int64)
    tm.assert_numpy_array_equal(result, np.array([1577853000000000000]))
def test_map_numeric_na_action():
    """``map`` with ``na_action="ignore"`` on int64[pyarrow] returns float64 with NaN."""
    arrow_ser = pd.Series([32, 40, None], dtype="int64[pyarrow]")
    result = arrow_ser.map(lambda x: 42, na_action="ignore")
    tm.assert_series_equal(result, pd.Series([42.0, 42.0, np.nan], dtype="float64"))
|