frame.py 438 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
74278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477
74778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862486348644865486648674868486948704871487248734874487548764877487848794880488148824883488448854886488748884889489048914892489348944895489648974898489949004901490249034904490549064907490849094910491149124913491449154916491749184919492049214922492349244925492649274928492949304931493249334934493549364937493849394940494149424943494449454946494749484949495049514952495349544955495649574958495949604961496249634964496549664967496849694970497149724973497449754976497749784979498049814982498349844985498649874988498949904991499249934994499549964997499849995000500150025003500450055006500750085009501050115012501350145015501650175018501950205021502250235024502550265027502850295030503150325033503450355036503750385039504050415042504350445045504650475048504950505051505250535054505550565057505850595060506150625063506450655066506750685069507050715072507350745075507650775078507950805081508250835084508550865087508850895090509150925093509450955096509750985099510051015102510351045105510651075108510951105111511251135114511551165117511851195120512151225123512451255126512751285129513051315132513351345135513651375138513951405141514251435144514551465147514851495150515151525153515451555156515751585159516051615162516351645165516651675168516951705171517251735174517551765177517851795180518151825183518451855186518751885189519051915192519351945195519651975198519952005201520252035204520552065207520852095210521152125213521452155216521752185219522052215222522352245225522652275228522952305231523252335234523552365237523852395240524152425243524452455246524752485249525052515252525352545255525652575258525952605261526252635264526552665267526852695270527152725273527452755276527
75278527952805281528252835284528552865287528852895290529152925293529452955296529752985299530053015302530353045305530653075308530953105311531253135314531553165317531853195320532153225323532453255326532753285329533053315332533353345335533653375338533953405341534253435344534553465347534853495350535153525353535453555356535753585359536053615362536353645365536653675368536953705371537253735374537553765377537853795380538153825383538453855386538753885389539053915392539353945395539653975398539954005401540254035404540554065407540854095410541154125413541454155416541754185419542054215422542354245425542654275428542954305431543254335434543554365437543854395440544154425443544454455446544754485449545054515452545354545455545654575458545954605461546254635464546554665467546854695470547154725473547454755476547754785479548054815482548354845485548654875488548954905491549254935494549554965497549854995500550155025503550455055506550755085509551055115512551355145515551655175518551955205521552255235524552555265527552855295530553155325533553455355536553755385539554055415542554355445545554655475548554955505551555255535554555555565557555855595560556155625563556455655566556755685569557055715572557355745575557655775578557955805581558255835584558555865587558855895590559155925593559455955596559755985599560056015602560356045605560656075608560956105611561256135614561556165617561856195620562156225623562456255626562756285629563056315632563356345635563656375638563956405641564256435644564556465647564856495650565156525653565456555656565756585659566056615662566356645665566656675668566956705671567256735674567556765677567856795680568156825683568456855686568756885689569056915692569356945695569656975698569957005701570257035704570557065707570857095710571157125713571457155716571757185719572057215722572357245725572657275728572957305731573257335734573557365737573857395740574157425743574457455746574757485749575057515752575357545755575657575758575957605761576257635764576557665767576857695770577157725773577457755776577
75778577957805781578257835784578557865787578857895790579157925793579457955796579757985799580058015802580358045805580658075808580958105811581258135814581558165817581858195820582158225823582458255826582758285829583058315832583358345835583658375838583958405841584258435844584558465847584858495850585158525853585458555856585758585859586058615862586358645865586658675868586958705871587258735874587558765877587858795880588158825883588458855886588758885889589058915892589358945895589658975898589959005901590259035904590559065907590859095910591159125913591459155916591759185919592059215922592359245925592659275928592959305931593259335934593559365937593859395940594159425943594459455946594759485949595059515952595359545955595659575958595959605961596259635964596559665967596859695970597159725973597459755976597759785979598059815982598359845985598659875988598959905991599259935994599559965997599859996000600160026003600460056006600760086009601060116012601360146015601660176018601960206021602260236024602560266027602860296030603160326033603460356036603760386039604060416042604360446045604660476048604960506051605260536054605560566057605860596060606160626063606460656066606760686069607060716072607360746075607660776078607960806081608260836084608560866087608860896090609160926093609460956096609760986099610061016102610361046105610661076108610961106111611261136114611561166117611861196120612161226123612461256126612761286129613061316132613361346135613661376138613961406141614261436144614561466147614861496150615161526153615461556156615761586159616061616162616361646165616661676168616961706171617261736174617561766177617861796180618161826183618461856186618761886189619061916192619361946195619661976198619962006201620262036204620562066207620862096210621162126213621462156216621762186219622062216222622362246225622662276228622962306231623262336234623562366237623862396240624162426243624462456246624762486249625062516252625362546255625662576258625962606261626262636264626562666267626862696270627162726273627462756276627
76278627962806281628262836284628562866287628862896290629162926293629462956296629762986299630063016302630363046305630663076308630963106311631263136314631563166317631863196320632163226323632463256326632763286329633063316332633363346335633663376338633963406341634263436344634563466347634863496350635163526353635463556356635763586359636063616362636363646365636663676368636963706371637263736374637563766377637863796380638163826383638463856386638763886389639063916392639363946395639663976398639964006401640264036404640564066407640864096410641164126413641464156416641764186419642064216422642364246425642664276428642964306431643264336434643564366437643864396440644164426443644464456446644764486449645064516452645364546455645664576458645964606461646264636464646564666467646864696470647164726473647464756476647764786479648064816482648364846485648664876488648964906491649264936494649564966497649864996500650165026503650465056506650765086509651065116512651365146515651665176518651965206521652265236524652565266527652865296530653165326533653465356536653765386539654065416542654365446545654665476548654965506551655265536554655565566557655865596560656165626563656465656566656765686569657065716572657365746575657665776578657965806581658265836584658565866587658865896590659165926593659465956596659765986599660066016602660366046605660666076608660966106611661266136614661566166617661866196620662166226623662466256626662766286629663066316632663366346635663666376638663966406641664266436644664566466647664866496650665166526653665466556656665766586659666066616662666366646665666666676668666966706671667266736674667566766677667866796680668166826683668466856686668766886689669066916692669366946695669666976698669967006701670267036704670567066707670867096710671167126713671467156716671767186719672067216722672367246725672667276728672967306731673267336734673567366737673867396740674167426743674467456746674767486749675067516752675367546755675667576758675967606761676267636764676567666767676867696770677167726773677467756776677
76778677967806781678267836784678567866787678867896790679167926793679467956796679767986799680068016802680368046805680668076808680968106811681268136814681568166817681868196820682168226823682468256826682768286829683068316832683368346835683668376838683968406841684268436844684568466847684868496850685168526853685468556856685768586859686068616862686368646865686668676868686968706871687268736874687568766877687868796880688168826883688468856886688768886889689068916892689368946895689668976898689969006901690269036904690569066907690869096910691169126913691469156916691769186919692069216922692369246925692669276928692969306931693269336934693569366937693869396940694169426943694469456946694769486949695069516952695369546955695669576958695969606961696269636964696569666967696869696970697169726973697469756976697769786979698069816982698369846985698669876988698969906991699269936994699569966997699869997000700170027003700470057006700770087009701070117012701370147015701670177018701970207021702270237024702570267027702870297030703170327033703470357036703770387039704070417042704370447045704670477048704970507051705270537054705570567057705870597060706170627063706470657066706770687069707070717072707370747075707670777078707970807081708270837084708570867087708870897090709170927093709470957096709770987099710071017102710371047105710671077108710971107111711271137114711571167117711871197120712171227123712471257126712771287129713071317132713371347135713671377138713971407141714271437144714571467147714871497150715171527153715471557156715771587159716071617162716371647165716671677168716971707171717271737174717571767177717871797180718171827183718471857186718771887189719071917192719371947195719671977198719972007201720272037204720572067207720872097210721172127213721472157216721772187219722072217222722372247225722672277228722972307231723272337234723572367237723872397240724172427243724472457246724772487249725072517252725372547255725672577258725972607261726272637264726572667267726872697270727172727273727472757276727
77278727972807281728272837284728572867287728872897290729172927293729472957296729772987299730073017302730373047305730673077308730973107311731273137314731573167317731873197320732173227323732473257326732773287329733073317332733373347335733673377338733973407341734273437344734573467347734873497350735173527353735473557356735773587359736073617362736373647365736673677368736973707371737273737374737573767377737873797380738173827383738473857386738773887389739073917392739373947395739673977398739974007401740274037404740574067407740874097410741174127413741474157416741774187419742074217422742374247425742674277428742974307431743274337434743574367437743874397440744174427443744474457446744774487449745074517452745374547455745674577458745974607461746274637464746574667467746874697470747174727473747474757476747774787479748074817482748374847485748674877488748974907491749274937494749574967497749874997500750175027503750475057506750775087509751075117512751375147515751675177518751975207521752275237524752575267527752875297530753175327533753475357536753775387539754075417542754375447545754675477548754975507551755275537554755575567557755875597560756175627563756475657566756775687569757075717572757375747575757675777578757975807581758275837584758575867587758875897590759175927593759475957596759775987599760076017602760376047605760676077608760976107611761276137614761576167617761876197620762176227623762476257626762776287629763076317632763376347635763676377638763976407641764276437644764576467647764876497650765176527653765476557656765776587659766076617662766376647665766676677668766976707671767276737674767576767677767876797680768176827683768476857686768776887689769076917692769376947695769676977698769977007701770277037704770577067707770877097710771177127713771477157716771777187719772077217722772377247725772677277728772977307731773277337734773577367737773877397740774177427743774477457746774777487749775077517752775377547755775677577758775977607761776277637764776577667767776877697770777177727773777477757776777
77778777977807781778277837784778577867787778877897790779177927793779477957796779777987799780078017802780378047805780678077808780978107811781278137814781578167817781878197820782178227823782478257826782778287829783078317832783378347835783678377838783978407841784278437844784578467847784878497850785178527853785478557856785778587859786078617862786378647865786678677868786978707871787278737874787578767877787878797880788178827883788478857886788778887889789078917892789378947895789678977898789979007901790279037904790579067907790879097910791179127913791479157916791779187919792079217922792379247925792679277928792979307931793279337934793579367937793879397940794179427943794479457946794779487949795079517952795379547955795679577958795979607961796279637964796579667967796879697970797179727973797479757976797779787979798079817982798379847985798679877988798979907991799279937994799579967997799879998000800180028003800480058006800780088009801080118012801380148015801680178018801980208021802280238024802580268027802880298030803180328033803480358036803780388039804080418042804380448045804680478048804980508051805280538054805580568057805880598060806180628063806480658066806780688069807080718072807380748075807680778078807980808081808280838084808580868087808880898090809180928093809480958096809780988099810081018102810381048105810681078108810981108111811281138114811581168117811881198120812181228123812481258126812781288129813081318132813381348135813681378138813981408141814281438144814581468147814881498150815181528153815481558156815781588159816081618162816381648165816681678168816981708171817281738174817581768177817881798180818181828183818481858186818781888189819081918192819381948195819681978198819982008201820282038204820582068207820882098210821182128213821482158216821782188219822082218222822382248225822682278228822982308231823282338234823582368237823882398240824182428243824482458246824782488249825082518252825382548255825682578258825982608261826282638264826582668267826882698270827182728273827482758276827
78278827982808281828282838284828582868287828882898290829182928293829482958296829782988299830083018302830383048305830683078308830983108311831283138314831583168317831883198320832183228323832483258326832783288329833083318332833383348335833683378338833983408341834283438344834583468347834883498350835183528353835483558356835783588359836083618362836383648365836683678368836983708371837283738374837583768377837883798380838183828383838483858386838783888389839083918392839383948395839683978398839984008401840284038404840584068407840884098410841184128413841484158416841784188419842084218422842384248425842684278428842984308431843284338434843584368437843884398440844184428443844484458446844784488449845084518452845384548455845684578458845984608461846284638464846584668467846884698470847184728473847484758476847784788479848084818482848384848485848684878488848984908491849284938494849584968497849884998500850185028503850485058506850785088509851085118512851385148515851685178518851985208521852285238524852585268527852885298530853185328533853485358536853785388539854085418542854385448545854685478548854985508551855285538554855585568557855885598560856185628563856485658566856785688569857085718572857385748575857685778578857985808581858285838584858585868587858885898590859185928593859485958596859785988599860086018602860386048605860686078608860986108611861286138614861586168617861886198620862186228623862486258626862786288629863086318632863386348635863686378638863986408641864286438644864586468647864886498650865186528653865486558656865786588659866086618662866386648665866686678668866986708671867286738674867586768677867886798680868186828683868486858686868786888689869086918692869386948695869686978698869987008701870287038704870587068707870887098710871187128713871487158716871787188719872087218722872387248725872687278728872987308731873287338734873587368737873887398740874187428743874487458746874787488749875087518752875387548755875687578758875987608761876287638764876587668767876887698770877187728773877487758776877
78778877987808781878287838784878587868787878887898790879187928793879487958796879787988799880088018802880388048805880688078808880988108811881288138814881588168817881888198820882188228823882488258826882788288829883088318832883388348835883688378838883988408841884288438844884588468847884888498850885188528853885488558856885788588859886088618862886388648865886688678868886988708871887288738874887588768877887888798880888188828883888488858886888788888889889088918892889388948895889688978898889989008901890289038904890589068907890889098910891189128913891489158916891789188919892089218922892389248925892689278928892989308931893289338934893589368937893889398940894189428943894489458946894789488949895089518952895389548955895689578958895989608961896289638964896589668967896889698970897189728973897489758976897789788979898089818982898389848985898689878988898989908991899289938994899589968997899889999000900190029003900490059006900790089009901090119012901390149015901690179018901990209021902290239024902590269027902890299030903190329033903490359036903790389039904090419042904390449045904690479048904990509051905290539054905590569057905890599060906190629063906490659066906790689069907090719072907390749075907690779078907990809081908290839084908590869087908890899090909190929093909490959096909790989099910091019102910391049105910691079108910991109111911291139114911591169117911891199120912191229123912491259126912791289129913091319132913391349135913691379138913991409141914291439144914591469147914891499150915191529153915491559156915791589159916091619162916391649165916691679168916991709171917291739174917591769177917891799180918191829183918491859186918791889189919091919192919391949195919691979198919992009201920292039204920592069207920892099210921192129213921492159216921792189219922092219222922392249225922692279228922992309231923292339234923592369237923892399240924192429243924492459246924792489249925092519252925392549255925692579258925992609261926292639264926592669267926892699270927192729273927492759276927
79278927992809281928292839284928592869287928892899290929192929293929492959296929792989299930093019302930393049305930693079308930993109311931293139314931593169317931893199320932193229323932493259326932793289329933093319332933393349335933693379338933993409341934293439344934593469347934893499350935193529353935493559356935793589359936093619362936393649365936693679368936993709371937293739374937593769377937893799380938193829383938493859386938793889389939093919392939393949395939693979398939994009401940294039404940594069407940894099410941194129413941494159416941794189419942094219422942394249425942694279428942994309431943294339434943594369437943894399440944194429443944494459446944794489449945094519452945394549455945694579458945994609461946294639464946594669467946894699470947194729473947494759476947794789479948094819482948394849485948694879488948994909491949294939494949594969497949894999500950195029503950495059506950795089509951095119512951395149515951695179518951995209521952295239524952595269527952895299530953195329533953495359536953795389539954095419542954395449545954695479548954995509551955295539554955595569557955895599560956195629563956495659566956795689569957095719572957395749575957695779578957995809581958295839584958595869587958895899590959195929593959495959596959795989599960096019602960396049605960696079608960996109611961296139614961596169617961896199620962196229623962496259626962796289629963096319632963396349635963696379638963996409641964296439644964596469647964896499650965196529653965496559656965796589659966096619662966396649665966696679668966996709671967296739674967596769677967896799680968196829683968496859686968796889689969096919692969396949695969696979698969997009701970297039704970597069707970897099710971197129713971497159716971797189719972097219722972397249725972697279728972997309731973297339734973597369737973897399740974197429743974497459746974797489749975097519752975397549755975697579758975997609761976297639764976597669767976897699770977197729773977497759776977
79778977997809781978297839784978597869787978897899790979197929793979497959796979797989799980098019802980398049805980698079808980998109811981298139814981598169817981898199820982198229823982498259826982798289829983098319832983398349835983698379838983998409841984298439844984598469847984898499850985198529853985498559856985798589859986098619862986398649865986698679868986998709871987298739874987598769877987898799880988198829883988498859886988798889889989098919892989398949895989698979898989999009901990299039904990599069907990899099910991199129913991499159916991799189919992099219922992399249925992699279928992999309931993299339934993599369937993899399940994199429943994499459946994799489949995099519952995399549955995699579958995999609961996299639964996599669967996899699970997199729973997499759976997799789979998099819982998399849985998699879988998999909991999299939994999599969997999899991000010001100021000310004100051000610007100081000910010100111001210013100141001510016100171001810019100201002110022100231002410025100261002710028100291003010031100321003310034100351003610037100381003910040100411004210043100441004510046100471004810049100501005110052100531005410055100561005710058100591006010061100621006310064100651006610067100681006910070100711007210073100741007510076100771007810079100801008110082100831008410085100861008710088100891009010091100921009310094100951009610097100981009910100101011010210103101041010510106101071010810109101101011110112101131011410115101161011710118101191012010121101221012310124101251012610127101281012910130101311013210133101341013510136101371013810139101401014110142101431014410145101461014710148101491015010151101521015310154101551015610157101581015910160101611016210163101641016510166101671016810169101701017110172101731017410175101761017710178101791018010181101821018310184101851018610187101881018910190101911019210193101941019510196101971019810199102001020110202102031020410205102061020710208102091021010211102121021310214102151021610217102181021910220102211
02221022310224102251022610227102281022910230102311023210233102341023510236102371023810239102401024110242102431024410245102461024710248102491025010251102521025310254102551025610257102581025910260102611026210263102641026510266102671026810269102701027110272102731027410275102761027710278102791028010281102821028310284102851028610287102881028910290102911029210293102941029510296102971029810299103001030110302103031030410305103061030710308103091031010311103121031310314103151031610317103181031910320103211032210323103241032510326103271032810329103301033110332103331033410335103361033710338103391034010341103421034310344103451034610347103481034910350103511035210353103541035510356103571035810359103601036110362103631036410365103661036710368103691037010371103721037310374103751037610377103781037910380103811038210383103841038510386103871038810389103901039110392103931039410395103961039710398103991040010401104021040310404104051040610407104081040910410104111041210413104141041510416104171041810419104201042110422104231042410425104261042710428104291043010431104321043310434104351043610437104381043910440104411044210443104441044510446104471044810449104501045110452104531045410455104561045710458104591046010461104621046310464104651046610467104681046910470104711047210473104741047510476104771047810479104801048110482104831048410485104861048710488104891049010491104921049310494104951049610497104981049910500105011050210503105041050510506105071050810509105101051110512105131051410515105161051710518105191052010521105221052310524105251052610527105281052910530105311053210533105341053510536105371053810539105401054110542105431054410545105461054710548105491055010551105521055310554105551055610557105581055910560105611056210563105641056510566105671056810569105701057110572105731057410575105761057710578105791058010581105821058310584105851058610587105881058910590105911059210593105941059510596105971059810599106001060110602106031060410605106061060710608106091061010611106121061310614106151061610617106181061910620106211
06221062310624106251062610627106281062910630106311063210633106341063510636106371063810639106401064110642106431064410645106461064710648106491065010651106521065310654106551065610657106581065910660106611066210663106641066510666106671066810669106701067110672106731067410675106761067710678106791068010681106821068310684106851068610687106881068910690106911069210693106941069510696106971069810699107001070110702107031070410705107061070710708107091071010711107121071310714107151071610717107181071910720107211072210723107241072510726107271072810729107301073110732107331073410735107361073710738107391074010741107421074310744107451074610747107481074910750107511075210753107541075510756107571075810759107601076110762107631076410765107661076710768107691077010771107721077310774107751077610777107781077910780107811078210783107841078510786107871078810789107901079110792107931079410795107961079710798107991080010801108021080310804108051080610807108081080910810108111081210813108141081510816108171081810819108201082110822108231082410825108261082710828108291083010831108321083310834108351083610837108381083910840108411084210843108441084510846108471084810849108501085110852108531085410855108561085710858108591086010861108621086310864108651086610867108681086910870108711087210873108741087510876108771087810879108801088110882108831088410885108861088710888108891089010891108921089310894108951089610897108981089910900109011090210903109041090510906109071090810909109101091110912109131091410915109161091710918109191092010921109221092310924109251092610927109281092910930109311093210933109341093510936109371093810939109401094110942109431094410945109461094710948109491095010951109521095310954109551095610957109581095910960109611096210963109641096510966109671096810969109701097110972109731097410975109761097710978109791098010981109821098310984109851098610987109881098910990109911099210993109941099510996109971099810999110001100111002110031100411005110061100711008110091101011011110121101311014110151101611017110181101911020110211
10221102311024110251102611027110281102911030110311103211033110341103511036110371103811039110401104111042110431104411045110461104711048110491105011051110521105311054110551105611057110581105911060110611106211063110641106511066110671106811069110701107111072110731107411075110761107711078110791108011081110821108311084110851108611087110881108911090110911109211093110941109511096110971109811099111001110111102111031110411105111061110711108111091111011111111121111311114111151111611117111181111911120111211112211123111241112511126111271112811129111301113111132111331113411135111361113711138111391114011141111421114311144111451114611147111481114911150111511115211153111541115511156111571115811159111601116111162111631116411165111661116711168111691117011171111721117311174111751117611177111781117911180111811118211183111841118511186111871118811189111901119111192111931119411195111961119711198111991120011201112021120311204112051120611207112081120911210112111121211213112141121511216112171121811219112201122111222112231122411225112261122711228112291123011231112321123311234112351123611237112381123911240112411124211243112441124511246112471124811249112501125111252112531125411255112561125711258112591126011261112621126311264112651126611267112681126911270112711127211273112741127511276112771127811279112801128111282112831128411285112861128711288112891129011291112921129311294112951129611297112981129911300113011130211303113041130511306113071130811309113101131111312113131131411315113161131711318113191132011321113221132311324113251132611327113281132911330113311133211333113341133511336113371133811339113401134111342113431134411345113461134711348113491135011351113521135311354113551135611357113581135911360113611136211363113641136511366113671136811369113701137111372113731137411375113761137711378113791138011381113821138311384113851138611387113881138911390113911139211393113941139511396113971139811399114001140111402114031140411405114061140711408114091141011411114121141311414114151141611417114181141911420114211
14221142311424114251142611427114281142911430114311143211433114341143511436114371143811439114401144111442114431144411445114461144711448114491145011451114521145311454114551145611457114581145911460114611146211463114641146511466114671146811469114701147111472114731147411475114761147711478114791148011481114821148311484114851148611487114881148911490114911149211493114941149511496114971149811499115001150111502115031150411505115061150711508115091151011511115121151311514115151151611517115181151911520115211152211523115241152511526115271152811529115301153111532115331153411535115361153711538115391154011541115421154311544115451154611547115481154911550115511155211553115541155511556115571155811559115601156111562115631156411565115661156711568115691157011571115721157311574115751157611577115781157911580115811158211583115841158511586115871158811589115901159111592115931159411595115961159711598115991160011601116021160311604116051160611607116081160911610116111161211613116141161511616116171161811619116201162111622116231162411625116261162711628116291163011631116321163311634116351163611637116381163911640116411164211643116441164511646116471164811649116501165111652116531165411655116561165711658116591166011661116621166311664116651166611667116681166911670116711167211673116741167511676116771167811679116801168111682116831168411685116861168711688116891169011691116921169311694116951169611697116981169911700117011170211703117041170511706117071170811709117101171111712117131171411715117161171711718117191172011721117221172311724117251172611727117281172911730117311173211733117341173511736117371173811739117401174111742117431174411745117461174711748117491175011751117521175311754117551175611757117581175911760117611176211763117641176511766117671176811769117701177111772117731177411775117761177711778117791178011781117821178311784117851178611787117881178911790117911179211793117941179511796117971179811799118001180111802118031180411805118061180711808118091181011811118121181311814118151181611817118181181911820118211
18221182311824118251182611827118281182911830118311183211833118341183511836118371183811839118401184111842118431184411845118461184711848118491185011851118521185311854118551185611857118581185911860118611186211863118641186511866118671186811869118701187111872118731187411875118761187711878118791188011881118821188311884118851188611887118881188911890118911189211893118941189511896118971189811899119001190111902119031190411905119061190711908119091191011911119121191311914119151191611917119181191911920119211192211923119241192511926119271192811929119301193111932119331193411935119361193711938119391194011941119421194311944119451194611947119481194911950119511195211953119541195511956119571195811959119601196111962119631196411965119661196711968119691197011971119721197311974119751197611977119781197911980119811198211983119841198511986119871198811989119901199111992119931199411995119961199711998119991200012001120021200312004120051200612007120081200912010120111201212013120141201512016120171201812019120201202112022120231202412025120261202712028120291203012031120321203312034120351203612037120381203912040120411204212043120441204512046120471204812049120501205112052120531205412055120561205712058120591206012061120621206312064120651206612067120681206912070120711207212073120741207512076120771207812079120801208112082120831208412085120861208712088120891209012091120921209312094120951209612097120981209912100121011210212103121041210512106121071210812109121101211112112121131211412115121161211712118121191212012121121221212312124121251212612127121281212912130121311213212133121341213512136121371213812139121401214112142121431214412145121461214712148121491215012151121521215312154121551215612157121581215912160121611216212163121641216512166121671216812169121701217112172121731217412175121761217712178121791218012181121821218312184121851218612187121881218912190121911219212193121941219512196121971219812199122001220112202122031220412205122061220712208122091221012211122121221312214122151221612217122181221912220122211
22221222312224122251222612227122281222912230122311223212233122341223512236122371223812239122401224112242122431224412245122461224712248122491225012251122521225312254122551225612257122581225912260122611226212263122641226512266122671226812269122701227112272122731227412275122761227712278122791228012281122821228312284122851228612287122881228912290122911229212293122941229512296122971229812299123001230112302123031230412305123061230712308123091231012311123121231312314123151231612317123181231912320123211232212323123241232512326123271232812329123301233112332123331233412335123361233712338123391234012341123421234312344123451234612347123481234912350123511235212353123541235512356123571235812359123601236112362123631236412365123661236712368123691237012371123721237312374123751237612377123781237912380123811238212383123841238512386123871238812389123901239112392123931239412395123961239712398123991240012401124021240312404124051240612407124081240912410124111241212413124141241512416124171241812419124201242112422124231242412425124261242712428124291243012431124321243312434124351243612437124381243912440124411244212443124441244512446124471244812449124501245112452124531245412455124561245712458124591246012461124621246312464124651246612467124681246912470124711247212473124741247512476124771247812479124801248112482124831248412485124861248712488124891249012491124921249312494124951249612497124981249912500125011250212503125041250512506125071250812509125101251112512125131251412515125161251712518125191252012521125221252312524125251252612527125281252912530125311253212533125341253512536125371253812539125401254112542125431254412545125461254712548125491255012551125521255312554125551255612557125581255912560125611256212563125641256512566125671256812569125701257112572125731257412575125761257712578125791258012581125821258312584125851258612587125881258912590125911259212593125941259512596125971259812599126001260112602126031260412605126061260712608126091261012611126121261312614126151261612617126181261912620126211
2622126231262412625126261262712628126291263012631126321263312634126351263612637126381263912640126411264212643126441264512646126471264812649126501265112652126531265412655126561265712658126591266012661126621266312664126651266612667126681266912670126711267212673126741267512676126771267812679126801268112682126831268412685126861268712688126891269012691126921269312694126951269612697126981269912700127011270212703127041270512706127071270812709127101271112712127131271412715127161271712718127191272012721127221272312724
  1. """
  2. DataFrame
  3. ---------
  4. An efficient 2D container for potentially mixed-type time series or other
  5. labeled data series.
  6. Similar to its R counterpart, data.frame, except providing automatic data
  7. alignment and a host of useful data manipulation methods having to do with the
  8. labeling information
  9. """
  10. from __future__ import annotations
  11. import collections
  12. from collections import abc
  13. from collections.abc import (
  14. Hashable,
  15. Iterable,
  16. Iterator,
  17. Mapping,
  18. Sequence,
  19. )
  20. import functools
  21. from inspect import signature
  22. from io import StringIO
  23. import itertools
  24. import operator
  25. import sys
  26. from textwrap import dedent
  27. from typing import (
  28. TYPE_CHECKING,
  29. Any,
  30. Callable,
  31. Literal,
  32. cast,
  33. overload,
  34. )
  35. import warnings
  36. import numpy as np
  37. from numpy import ma
  38. from pandas._config import (
  39. get_option,
  40. using_copy_on_write,
  41. warn_copy_on_write,
  42. )
  43. from pandas._config.config import _get_option
  44. from pandas._libs import (
  45. algos as libalgos,
  46. lib,
  47. properties,
  48. )
  49. from pandas._libs.hashtable import duplicated
  50. from pandas._libs.lib import is_range_indexer
  51. from pandas.compat import PYPY
  52. from pandas.compat._constants import (
  53. REF_COUNT,
  54. WARNING_CHECK_DISABLED,
  55. )
  56. from pandas.compat._optional import import_optional_dependency
  57. from pandas.compat.numpy import function as nv
  58. from pandas.errors import (
  59. ChainedAssignmentError,
  60. InvalidIndexError,
  61. _chained_assignment_method_msg,
  62. _chained_assignment_msg,
  63. _chained_assignment_warning_method_msg,
  64. _chained_assignment_warning_msg,
  65. )
  66. from pandas.util._decorators import (
  67. Appender,
  68. Substitution,
  69. deprecate_nonkeyword_arguments,
  70. doc,
  71. )
  72. from pandas.util._exceptions import (
  73. find_stack_level,
  74. rewrite_warning,
  75. )
  76. from pandas.util._validators import (
  77. validate_ascending,
  78. validate_bool_kwarg,
  79. validate_percentile,
  80. )
  81. from pandas.core.dtypes.cast import (
  82. LossySetitemError,
  83. can_hold_element,
  84. construct_1d_arraylike_from_scalar,
  85. construct_2d_arraylike_from_scalar,
  86. find_common_type,
  87. infer_dtype_from_scalar,
  88. invalidate_string_dtypes,
  89. maybe_box_native,
  90. maybe_downcast_to_dtype,
  91. )
  92. from pandas.core.dtypes.common import (
  93. infer_dtype_from_object,
  94. is_1d_only_ea_dtype,
  95. is_array_like,
  96. is_bool_dtype,
  97. is_dataclass,
  98. is_dict_like,
  99. is_float,
  100. is_float_dtype,
  101. is_hashable,
  102. is_integer,
  103. is_integer_dtype,
  104. is_iterator,
  105. is_list_like,
  106. is_scalar,
  107. is_sequence,
  108. needs_i8_conversion,
  109. pandas_dtype,
  110. )
  111. from pandas.core.dtypes.concat import concat_compat
  112. from pandas.core.dtypes.dtypes import (
  113. ArrowDtype,
  114. BaseMaskedDtype,
  115. ExtensionDtype,
  116. )
  117. from pandas.core.dtypes.missing import (
  118. isna,
  119. notna,
  120. )
  121. from pandas.core import (
  122. algorithms,
  123. common as com,
  124. nanops,
  125. ops,
  126. roperator,
  127. )
  128. from pandas.core.accessor import CachedAccessor
  129. from pandas.core.apply import reconstruct_and_relabel_result
  130. from pandas.core.array_algos.take import take_2d_multi
  131. from pandas.core.arraylike import OpsMixin
  132. from pandas.core.arrays import (
  133. BaseMaskedArray,
  134. DatetimeArray,
  135. ExtensionArray,
  136. PeriodArray,
  137. TimedeltaArray,
  138. )
  139. from pandas.core.arrays.sparse import SparseFrameAccessor
  140. from pandas.core.arrays.string_ import StringDtype
  141. from pandas.core.construction import (
  142. ensure_wrapped_if_datetimelike,
  143. sanitize_array,
  144. sanitize_masked_array,
  145. )
  146. from pandas.core.generic import (
  147. NDFrame,
  148. make_doc,
  149. )
  150. from pandas.core.indexers import check_key_length
  151. from pandas.core.indexes.api import (
  152. DatetimeIndex,
  153. Index,
  154. PeriodIndex,
  155. default_index,
  156. ensure_index,
  157. ensure_index_from_sequences,
  158. )
  159. from pandas.core.indexes.multi import (
  160. MultiIndex,
  161. maybe_droplevels,
  162. )
  163. from pandas.core.indexing import (
  164. check_bool_indexer,
  165. check_dict_or_set_indexers,
  166. )
  167. from pandas.core.internals import (
  168. ArrayManager,
  169. BlockManager,
  170. )
  171. from pandas.core.internals.construction import (
  172. arrays_to_mgr,
  173. dataclasses_to_dicts,
  174. dict_to_mgr,
  175. mgr_to_mgr,
  176. ndarray_to_mgr,
  177. nested_data_to_arrays,
  178. rec_array_to_mgr,
  179. reorder_arrays,
  180. to_arrays,
  181. treat_as_nested,
  182. )
  183. from pandas.core.methods import selectn
  184. from pandas.core.reshape.melt import melt
  185. from pandas.core.series import Series
  186. from pandas.core.shared_docs import _shared_docs
  187. from pandas.core.sorting import (
  188. get_group_index,
  189. lexsort_indexer,
  190. nargsort,
  191. )
  192. from pandas.io.common import get_handle
  193. from pandas.io.formats import (
  194. console,
  195. format as fmt,
  196. )
  197. from pandas.io.formats.info import (
  198. INFO_DOCSTRING,
  199. DataFrameInfo,
  200. frame_sub_kwargs,
  201. )
  202. import pandas.plotting
  203. if TYPE_CHECKING:
  204. import datetime
  205. from pandas._libs.internals import BlockValuesRefs
  206. from pandas._typing import (
  207. AggFuncType,
  208. AnyAll,
  209. AnyArrayLike,
  210. ArrayLike,
  211. Axes,
  212. Axis,
  213. AxisInt,
  214. ColspaceArgType,
  215. CompressionOptions,
  216. CorrelationMethod,
  217. DropKeep,
  218. Dtype,
  219. DtypeObj,
  220. FilePath,
  221. FloatFormatType,
  222. FormattersType,
  223. Frequency,
  224. FromDictOrient,
  225. IgnoreRaise,
  226. IndexKeyFunc,
  227. IndexLabel,
  228. JoinValidate,
  229. Level,
  230. MergeHow,
  231. MergeValidate,
  232. MutableMappingT,
  233. NaAction,
  234. NaPosition,
  235. NsmallestNlargestKeep,
  236. PythonFuncType,
  237. QuantileInterpolation,
  238. ReadBuffer,
  239. ReindexMethod,
  240. Renamer,
  241. Scalar,
  242. Self,
  243. SequenceNotStr,
  244. SortKind,
  245. StorageOptions,
  246. Suffixes,
  247. ToGbqIfexist,
  248. ToStataByteorder,
  249. ToTimestampHow,
  250. UpdateJoin,
  251. ValueKeyFunc,
  252. WriteBuffer,
  253. XMLParsers,
  254. npt,
  255. )
  256. from pandas.core.groupby.generic import DataFrameGroupBy
  257. from pandas.core.interchange.dataframe_protocol import DataFrame as DataFrameXchg
  258. from pandas.core.internals import SingleDataManager
  259. from pandas.io.formats.style import Styler
  260. # ---------------------------------------------------------------------
  261. # Docstring templates
  # Mapping of placeholder name -> DataFrame-specific text fragment.
  # NOTE(review): presumably these values are substituted into shared docstring
  # templates (the ``doc`` / ``Substitution`` / ``Appender`` decorators are imported
  # above); the call sites are outside this chunk -- confirm before renaming keys.
  # The values are runtime strings: their exact text and whitespace end up in the
  # rendered documentation, so edit them only when the docs should change.
  262. _shared_doc_kwargs = {
  # Short single-line fragments: axis names, class name, and accepted axis values.
  263. "axes": "index, columns",
  264. "klass": "DataFrame",
  265. "axes_single_arg": "{0 or 'index', 1 or 'columns'}",
  # Multi-line fragments below are complete parameter entries (name, type, text).
  266. "axis": """axis : {0 or 'index', 1 or 'columns'}, default 0
  267. If 0 or 'index': apply function to each column.
  268. If 1 or 'columns': apply function to each row.""",
  269. "inplace": """
  270. inplace : bool, default False
  271. Whether to modify the DataFrame rather than creating a new one.""",
  # Parameter entry for ``by``: names to sort by, interpreted relative to ``axis``.
  272. "optional_by": """
  273. by : str or list of str
  274. Name or list of names to sort by.
  275. - if `axis` is 0 or `'index'` then `by` may contain index
  276. levels and/or column labels.
  277. - if `axis` is 1 or `'columns'` then `by` may contain column
  278. levels and/or index labels.""",
  # Parameter entries for ``labels`` / ``index`` / ``columns`` / ``axis``.
  279. "optional_reindex": """
  280. labels : array-like, optional
  281. New labels / index to conform the axis specified by 'axis' to.
  282. index : array-like, optional
  283. New labels for the index. Preferably an Index object to avoid
  284. duplicating data.
  285. columns : array-like, optional
  286. New labels for the columns. Preferably an Index object to avoid
  287. duplicating data.
  288. axis : int or str, optional
  289. Axis to target. Can be either the axis name ('index', 'columns')
  290. or number (0, 1).""",
  291. }
  292. _merge_doc = """
  293. Merge DataFrame or named Series objects with a database-style join.
  294. A named Series object is treated as a DataFrame with a single named column.
  295. The join is done on columns or indexes. If joining columns on
  296. columns, the DataFrame indexes *will be ignored*. Otherwise if joining indexes
  297. on indexes or indexes on a column or columns, the index will be passed on.
  298. When performing a cross merge, no column specifications to merge on are
  299. allowed.
  300. .. warning::
  301. If both key columns contain rows where the key is a null value, those
  302. rows will be matched against each other. This is different from usual SQL
  303. join behaviour and can lead to unexpected results.
  304. Parameters
  305. ----------%s
  306. right : DataFrame or named Series
  307. Object to merge with.
  308. how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'inner'
  309. Type of merge to be performed.
  310. * left: use only keys from left frame, similar to a SQL left outer join;
  311. preserve key order.
  312. * right: use only keys from right frame, similar to a SQL right outer join;
  313. preserve key order.
  314. * outer: use union of keys from both frames, similar to a SQL full outer
  315. join; sort keys lexicographically.
  316. * inner: use intersection of keys from both frames, similar to a SQL inner
  317. join; preserve the order of the left keys.
  318. * cross: creates the cartesian product from both frames, preserves the order
  319. of the left keys.
  320. on : label or list
  321. Column or index level names to join on. These must be found in both
  322. DataFrames. If `on` is None and not merging on indexes then this defaults
  323. to the intersection of the columns in both DataFrames.
  324. left_on : label or list, or array-like
  325. Column or index level names to join on in the left DataFrame. Can also
  326. be an array or list of arrays of the length of the left DataFrame.
  327. These arrays are treated as if they are columns.
  328. right_on : label or list, or array-like
  329. Column or index level names to join on in the right DataFrame. Can also
  330. be an array or list of arrays of the length of the right DataFrame.
  331. These arrays are treated as if they are columns.
  332. left_index : bool, default False
  333. Use the index from the left DataFrame as the join key(s). If it is a
  334. MultiIndex, the number of keys in the other DataFrame (either the index
  335. or a number of columns) must match the number of levels.
  336. right_index : bool, default False
  337. Use the index from the right DataFrame as the join key. Same caveats as
  338. left_index.
  339. sort : bool, default False
  340. Sort the join keys lexicographically in the result DataFrame. If False,
  341. the order of the join keys depends on the join type (how keyword).
  342. suffixes : list-like, default is ("_x", "_y")
  343. A length-2 sequence where each element is optionally a string
  344. indicating the suffix to add to overlapping column names in
  345. `left` and `right` respectively. Pass a value of `None` instead
  346. of a string to indicate that the column name from `left` or
  347. `right` should be left as-is, with no suffix. At least one of the
  348. values must not be None.
  349. copy : bool, default True
  350. If False, avoid copy if possible.
  351. .. note::
  352. The `copy` keyword will change behavior in pandas 3.0.
  353. `Copy-on-Write
  354. <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
  355. will be enabled by default, which means that all methods with a
  356. `copy` keyword will use a lazy copy mechanism to defer the copy and
  357. ignore the `copy` keyword. The `copy` keyword will be removed in a
  358. future version of pandas.
  359. You can already get the future behavior and improvements through
  360. enabling copy on write ``pd.options.mode.copy_on_write = True``
  361. indicator : bool or str, default False
  362. If True, adds a column to the output DataFrame called "_merge" with
  363. information on the source of each row. The column can be given a different
  364. name by providing a string argument. The column will have a Categorical
  365. type with the value of "left_only" for observations whose merge key only
  366. appears in the left DataFrame, "right_only" for observations
  367. whose merge key only appears in the right DataFrame, and "both"
  368. if the observation's merge key is found in both DataFrames.
  369. validate : str, optional
  370. If specified, checks if merge is of specified type.
  371. * "one_to_one" or "1:1": check if merge keys are unique in both
  372. left and right datasets.
  373. * "one_to_many" or "1:m": check if merge keys are unique in left
  374. dataset.
  375. * "many_to_one" or "m:1": check if merge keys are unique in right
  376. dataset.
  377. * "many_to_many" or "m:m": allowed, but does not result in checks.
  378. Returns
  379. -------
  380. DataFrame
  381. A DataFrame of the two merged objects.
  382. See Also
  383. --------
  384. merge_ordered : Merge with optional filling/interpolation.
  385. merge_asof : Merge on nearest keys.
  386. DataFrame.join : Similar method using indices.
  387. Examples
  388. --------
  389. >>> df1 = pd.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'],
  390. ... 'value': [1, 2, 3, 5]})
  391. >>> df2 = pd.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'],
  392. ... 'value': [5, 6, 7, 8]})
  393. >>> df1
  394. lkey value
  395. 0 foo 1
  396. 1 bar 2
  397. 2 baz 3
  398. 3 foo 5
  399. >>> df2
  400. rkey value
  401. 0 foo 5
  402. 1 bar 6
  403. 2 baz 7
  404. 3 foo 8
  405. Merge df1 and df2 on the lkey and rkey columns. The value columns have
  406. the default suffixes, _x and _y, appended.
  407. >>> df1.merge(df2, left_on='lkey', right_on='rkey')
  408. lkey value_x rkey value_y
  409. 0 foo 1 foo 5
  410. 1 foo 1 foo 8
  411. 2 bar 2 bar 6
  412. 3 baz 3 baz 7
  413. 4 foo 5 foo 5
  414. 5 foo 5 foo 8
  415. Merge DataFrames df1 and df2 with specified left and right suffixes
  416. appended to any overlapping columns.
  417. >>> df1.merge(df2, left_on='lkey', right_on='rkey',
  418. ... suffixes=('_left', '_right'))
  419. lkey value_left rkey value_right
  420. 0 foo 1 foo 5
  421. 1 foo 1 foo 8
  422. 2 bar 2 bar 6
  423. 3 baz 3 baz 7
  424. 4 foo 5 foo 5
  425. 5 foo 5 foo 8
  426. Merge DataFrames df1 and df2, but raise an exception if the DataFrames have
  427. any overlapping columns.
  428. >>> df1.merge(df2, left_on='lkey', right_on='rkey', suffixes=(False, False))
  429. Traceback (most recent call last):
  430. ...
  431. ValueError: columns overlap but no suffix specified:
  432. Index(['value'], dtype='object')
  433. >>> df1 = pd.DataFrame({'a': ['foo', 'bar'], 'b': [1, 2]})
  434. >>> df2 = pd.DataFrame({'a': ['foo', 'baz'], 'c': [3, 4]})
  435. >>> df1
  436. a b
  437. 0 foo 1
  438. 1 bar 2
  439. >>> df2
  440. a c
  441. 0 foo 3
  442. 1 baz 4
  443. >>> df1.merge(df2, how='inner', on='a')
  444. a b c
  445. 0 foo 1 3
  446. >>> df1.merge(df2, how='left', on='a')
  447. a b c
  448. 0 foo 1 3.0
  449. 1 bar 2 NaN
  450. >>> df1 = pd.DataFrame({'left': ['foo', 'bar']})
  451. >>> df2 = pd.DataFrame({'right': [7, 8]})
  452. >>> df1
  453. left
  454. 0 foo
  455. 1 bar
  456. >>> df2
  457. right
  458. 0 7
  459. 1 8
  460. >>> df1.merge(df2, how='cross')
  461. left right
  462. 0 foo 7
  463. 1 foo 8
  464. 2 bar 7
  465. 3 bar 8
  466. """
  467. # -----------------------------------------------------------------------
  468. # DataFrame class
  469. class DataFrame(NDFrame, OpsMixin):
  470. """
  471. Two-dimensional, size-mutable, potentially heterogeneous tabular data.
  472. Data structure also contains labeled axes (rows and columns).
  473. Arithmetic operations align on both row and column labels. Can be
  474. thought of as a dict-like container for Series objects. The primary
  475. pandas data structure.
  476. Parameters
  477. ----------
  478. data : ndarray (structured or homogeneous), Iterable, dict, or DataFrame
  479. Dict can contain Series, arrays, constants, dataclass or list-like objects. If
  480. data is a dict, column order follows insertion-order. If a dict contains Series
  481. which have an index defined, it is aligned by its index. This alignment also
  482. occurs if data is a Series or a DataFrame itself. Alignment is done on
  483. Series/DataFrame inputs.
  484. If data is a list of dicts, column order follows insertion-order.
  485. index : Index or array-like
  486. Index to use for resulting frame. Will default to RangeIndex if
  487. no indexing information part of input data and no index provided.
  488. columns : Index or array-like
  489. Column labels to use for resulting frame when data does not have them,
  490. defaulting to RangeIndex(0, 1, 2, ..., n). If data contains column labels,
  491. will perform column selection instead.
  492. dtype : dtype, default None
  493. Data type to force. Only a single dtype is allowed. If None, infer.
  494. copy : bool or None, default None
  495. Copy data from inputs.
  496. For dict data, the default of None behaves like ``copy=True``. For DataFrame
  497. or 2d ndarray input, the default of None behaves like ``copy=False``.
  498. If data is a dict containing one or more Series (possibly of different dtypes),
  499. ``copy=False`` will ensure that these inputs are not copied.
  500. .. versionchanged:: 1.3.0
  501. See Also
  502. --------
  503. DataFrame.from_records : Constructor from tuples, also record arrays.
  504. DataFrame.from_dict : From dicts of Series, arrays, or dicts.
  505. read_csv : Read a comma-separated values (csv) file into DataFrame.
  506. read_table : Read general delimited file into DataFrame.
  507. read_clipboard : Read text from clipboard into DataFrame.
  508. Notes
  509. -----
  510. Please reference the :ref:`User Guide <basics.dataframe>` for more information.
  511. Examples
  512. --------
  513. Constructing DataFrame from a dictionary.
  514. >>> d = {'col1': [1, 2], 'col2': [3, 4]}
  515. >>> df = pd.DataFrame(data=d)
  516. >>> df
  517. col1 col2
  518. 0 1 3
  519. 1 2 4
  520. Notice that the inferred dtype is int64.
  521. >>> df.dtypes
  522. col1 int64
  523. col2 int64
  524. dtype: object
  525. To enforce a single dtype:
  526. >>> df = pd.DataFrame(data=d, dtype=np.int8)
  527. >>> df.dtypes
  528. col1 int8
  529. col2 int8
  530. dtype: object
  531. Constructing DataFrame from a dictionary including Series:
  532. >>> d = {'col1': [0, 1, 2, 3], 'col2': pd.Series([2, 3], index=[2, 3])}
  533. >>> pd.DataFrame(data=d, index=[0, 1, 2, 3])
  534. col1 col2
  535. 0 0 NaN
  536. 1 1 NaN
  537. 2 2 2.0
  538. 3 3 3.0
  539. Constructing DataFrame from numpy ndarray:
  540. >>> df2 = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
  541. ... columns=['a', 'b', 'c'])
  542. >>> df2
  543. a b c
  544. 0 1 2 3
  545. 1 4 5 6
  546. 2 7 8 9
  547. Constructing DataFrame from a numpy ndarray that has labeled columns:
  548. >>> data = np.array([(1, 2, 3), (4, 5, 6), (7, 8, 9)],
  549. ... dtype=[("a", "i4"), ("b", "i4"), ("c", "i4")])
  550. >>> df3 = pd.DataFrame(data, columns=['c', 'a'])
  551. ...
  552. >>> df3
  553. c a
  554. 0 3 1
  555. 1 6 4
  556. 2 9 7
  557. Constructing DataFrame from dataclass:
  558. >>> from dataclasses import make_dataclass
  559. >>> Point = make_dataclass("Point", [("x", int), ("y", int)])
  560. >>> pd.DataFrame([Point(0, 0), Point(0, 3), Point(2, 3)])
  561. x y
  562. 0 0 0
  563. 1 0 3
  564. 2 2 3
  565. Constructing DataFrame from Series/DataFrame:
  566. >>> ser = pd.Series([1, 2, 3], index=["a", "b", "c"])
  567. >>> df = pd.DataFrame(data=ser, index=["a", "c"])
  568. >>> df
  569. 0
  570. a 1
  571. c 3
  572. >>> df1 = pd.DataFrame([1, 2, 3], index=["a", "b", "c"], columns=["x"])
  573. >>> df2 = pd.DataFrame(data=df1, index=["a", "c"])
  574. >>> df2
  575. x
  576. a 1
  577. c 3
  578. """
  # NDFrame's internal attribute names, extended with the two DataFrame axes.
  _internal_names_set = {"columns", "index"} | NDFrame._internal_names_set
  # String tag identifying this container type.
  _typ = "dataframe"
  # NOTE(review): presumably the operand types DataFrame is willing to handle
  # in array-protocol dispatch -- confirm against the users of _HANDLED_TYPES.
  _HANDLED_TYPES = (Series, Index, ExtensionArray, np.ndarray)
  # Accessor names declared on the class; only "sparse" is listed here.
  _accessors: set[str] = {"sparse"}
  # Same hidden attributes as NDFrame (the extra frozenset is empty).
  _hidden_attrs: frozenset[str] = NDFrame._hidden_attrs | frozenset([])
  # Underlying data manager; either manager type may back a DataFrame.
  _mgr: BlockManager | ArrayManager

  # similar to __array_priority__, positions DataFrame before Series, Index,
  # and ExtensionArray. Should NOT be overridden by subclasses.
  __pandas_priority__ = 4000
  588. @property
  589. def _constructor(self) -> Callable[..., DataFrame]:
  590. return DataFrame
  591. def _constructor_from_mgr(self, mgr, axes) -> DataFrame:
  592. df = DataFrame._from_mgr(mgr, axes=axes)
  593. if type(self) is DataFrame:
  594. # This would also work `if self._constructor is DataFrame`, but
  595. # this check is slightly faster, benefiting the most-common case.
  596. return df
  597. elif type(self).__name__ == "GeoDataFrame":
  598. # Shim until geopandas can override their _constructor_from_mgr
  599. # bc they have different behavior for Managers than for DataFrames
  600. return self._constructor(mgr)
  601. # We assume that the subclass __init__ knows how to handle a
  602. # pd.DataFrame object.
  603. return self._constructor(df)
  # Callable used to build the one-dimensional result of slicing a frame;
  # defaults to Series.
  _constructor_sliced: Callable[..., Series] = Series
  605. def _constructor_sliced_from_mgr(self, mgr, axes) -> Series:
  606. ser = Series._from_mgr(mgr, axes)
  607. ser._name = None # caller is responsible for setting real name
  608. if type(self) is DataFrame:
  609. # This would also work `if self._constructor_sliced is Series`, but
  610. # this check is slightly faster, benefiting the most-common case.
  611. return ser
  612. # We assume that the subclass __init__ knows how to handle a
  613. # pd.Series object.
  614. return self._constructor_sliced(ser)
  615. # ----------------------------------------------------------------------
  616. # Constructors
  def __init__(
      self,
      data=None,
      index: Axes | None = None,
      columns: Axes | None = None,
      dtype: Dtype | None = None,
      copy: bool | None = None,
  ) -> None:
      """
      Build the internal manager from ``data`` and initialise the frame.

      The input is dispatched on its type, in this order: an existing
      DataFrame or manager, a dict, a masked array, an
      ndarray/Series/Index/ExtensionArray, any other list-like, and finally
      a scalar (which requires both ``index`` and ``columns``).

      Raises
      ------
      ValueError
          If ``index`` or ``columns`` is a set, or if scalar ``data`` is
          passed without both ``index`` and ``columns``.
      TypeError
          If ``data`` is a ``MaskedRecords`` instance.
      """
      # Set to True only when the manager comes from unwrapping a DataFrame,
      # so that no deprecation warning is issued for it below.
      allow_mgr = False
      if dtype is not None:
          dtype = self._validate_dtype(dtype)

      if isinstance(data, DataFrame):
          data = data._mgr
          allow_mgr = True
          if not copy:
              # if not copying data, ensure to still return a shallow copy
              # to avoid the result sharing the same Manager
              data = data.copy(deep=False)

      if isinstance(data, (BlockManager, ArrayManager)):
          if not allow_mgr:
              # GH#52419
              warnings.warn(
                  f"Passing a {type(data).__name__} to {type(self).__name__} "
                  "is deprecated and will raise in a future version. "
                  "Use public APIs instead.",
                  DeprecationWarning,
                  stacklevel=1,  # bump to 2 once pyarrow 15.0 is released with fix
              )

          if using_copy_on_write():
              data = data.copy(deep=False)
          # first check if a Manager is passed without any other arguments
          # -> use fastpath (without checking Manager type)
          if index is None and columns is None and dtype is None and not copy:
              # GH#33357 fastpath
              NDFrame.__init__(self, data)
              return

      manager = _get_option("mode.data_manager", silent=True)

      # Captured before ``data``/``dtype`` are rewritten below; used by the
      # dtype-inference deprecation check at the end.
      is_pandas_object = isinstance(data, (Series, Index, ExtensionArray))
      data_dtype = getattr(data, "dtype", None)
      original_dtype = dtype

      # GH47215
      if isinstance(index, set):
          raise ValueError("index cannot be a set")
      if isinstance(columns, set):
          raise ValueError("columns cannot be a set")

      # Resolve the default for ``copy`` based on the kind of input.
      if copy is None:
          if isinstance(data, dict):
              # retain pre-GH#38939 default behavior
              copy = True
          elif (
              manager == "array"
              and isinstance(data, (np.ndarray, ExtensionArray))
              and data.ndim == 2
          ):
              # INFO(ArrayManager) by default copy the 2D input array to get
              # contiguous 1D arrays
              copy = True
          elif using_copy_on_write() and not isinstance(
              data, (Index, DataFrame, Series)
          ):
              copy = True
          else:
              copy = False

      if data is None:
          # No data: fall through to the list-like branch with an empty list,
          # empty default axes and object dtype unless the caller gave others.
          index = index if index is not None else default_index(0)
          columns = columns if columns is not None else default_index(0)
          dtype = dtype if dtype is not None else pandas_dtype(object)
          data = []

      if isinstance(data, (BlockManager, ArrayManager)):
          mgr = self._init_mgr(
              data, axes={"index": index, "columns": columns}, dtype=dtype, copy=copy
          )

      elif isinstance(data, dict):
          # GH#38939 de facto copy defaults to False only in non-dict cases
          mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy, typ=manager)
      elif isinstance(data, ma.MaskedArray):
          from numpy.ma import mrecords

          # masked recarray
          if isinstance(data, mrecords.MaskedRecords):
              raise TypeError(
                  "MaskedRecords are not supported. Pass "
                  "{name: data[name] for name in data.dtype.names} "
                  "instead"
              )

          # a masked array
          data = sanitize_masked_array(data)
          mgr = ndarray_to_mgr(
              data,
              index,
              columns,
              dtype=dtype,
              copy=copy,
              typ=manager,
          )

      elif isinstance(data, (np.ndarray, Series, Index, ExtensionArray)):
          if data.dtype.names:
              # i.e. numpy structured array
              data = cast(np.ndarray, data)
              mgr = rec_array_to_mgr(
                  data,
                  index,
                  columns,
                  dtype,
                  copy,
                  typ=manager,
              )
          elif getattr(data, "name", None) is not None:
              # i.e. Series/Index with non-None name
              # The named object becomes a single column keyed by its name;
              # without copy-on-write this path always copies.
              _copy = copy if using_copy_on_write() else True
              mgr = dict_to_mgr(
                  # error: Item "ndarray" of "Union[ndarray, Series, Index]" has no
                  # attribute "name"
                  {data.name: data},  # type: ignore[union-attr]
                  index,
                  columns,
                  dtype=dtype,
                  typ=manager,
                  copy=_copy,
              )
          else:
              mgr = ndarray_to_mgr(
                  data,
                  index,
                  columns,
                  dtype=dtype,
                  copy=copy,
                  typ=manager,
              )

      # For data is list-like, or Iterable (will consume into list)
      elif is_list_like(data):
          if not isinstance(data, abc.Sequence):
              if hasattr(data, "__array__"):
                  # GH#44616 big perf improvement for e.g. pytorch tensor
                  data = np.asarray(data)
              else:
                  data = list(data)
          if len(data) > 0:
              if is_dataclass(data[0]):
                  data = dataclasses_to_dicts(data)
              if not isinstance(data, np.ndarray) and treat_as_nested(data):
                  # exclude ndarray as we may have cast it a few lines above
                  if columns is not None:
                      columns = ensure_index(columns)
                  arrays, columns, index = nested_data_to_arrays(
                      # error: Argument 3 to "nested_data_to_arrays" has incompatible
                      # type "Optional[Collection[Any]]"; expected "Optional[Index]"
                      data,
                      columns,
                      index,  # type: ignore[arg-type]
                      dtype,
                  )
                  mgr = arrays_to_mgr(
                      arrays,
                      columns,
                      index,
                      dtype=dtype,
                      typ=manager,
                  )
              else:
                  mgr = ndarray_to_mgr(
                      data,
                      index,
                      columns,
                      dtype=dtype,
                      copy=copy,
                      typ=manager,
                  )
          else:
              # Empty list-like: build from an empty dict so only the axes
              # and dtype determine the result.
              mgr = dict_to_mgr(
                  {},
                  index,
                  columns if columns is not None else default_index(0),
                  dtype=dtype,
                  typ=manager,
              )
      # For data is scalar
      else:
          if index is None or columns is None:
              raise ValueError("DataFrame constructor not properly called!")

          index = ensure_index(index)
          columns = ensure_index(columns)

          if not dtype:
              dtype, _ = infer_dtype_from_scalar(data)

          # For data is a scalar extension dtype
          if isinstance(dtype, ExtensionDtype):
              # TODO(EA2D): special case not needed with 2D EAs

              # One 1D array per column, each filled with the scalar.
              values = [
                  construct_1d_arraylike_from_scalar(data, len(index), dtype)
                  for _ in range(len(columns))
              ]
              mgr = arrays_to_mgr(values, columns, index, dtype=None, typ=manager)
          else:
              arr2d = construct_2d_arraylike_from_scalar(
                  data,
                  len(index),
                  len(columns),
                  dtype,
                  copy,
              )

              # arr2d was just created here, so no further copy is requested.
              mgr = ndarray_to_mgr(
                  arr2d,
                  index,
                  columns,
                  dtype=arr2d.dtype,
                  copy=False,
                  typ=manager,
              )

      # ensure correct Manager type according to settings
      mgr = mgr_to_mgr(mgr, typ=manager)

      NDFrame.__init__(self, mgr)

      # Warn when an object-dtype pandas input ended up with a different
      # (inferred) dtype and the caller did not request a dtype explicitly.
      if original_dtype is None and is_pandas_object and data_dtype == np.object_:
          if self.dtypes.iloc[0] != data_dtype:
              warnings.warn(
                  "Dtype inference on a pandas object "
                  "(Series, Index, ExtensionArray) is deprecated. The DataFrame "
                  "constructor will keep the original dtype in the future. "
                  "Call `infer_objects` on the result to get the old "
                  "behavior.",
                  FutureWarning,
                  stacklevel=2,
              )
  838. # ----------------------------------------------------------------------
  839. def __dataframe__(
  840. self, nan_as_null: bool = False, allow_copy: bool = True
  841. ) -> DataFrameXchg:
  842. """
  843. Return the dataframe interchange object implementing the interchange protocol.
  844. Parameters
  845. ----------
  846. nan_as_null : bool, default False
  847. `nan_as_null` is DEPRECATED and has no effect. Please avoid using
  848. it; it will be removed in a future release.
  849. allow_copy : bool, default True
  850. Whether to allow memory copying when exporting. If set to False
  851. it would cause non-zero-copy exports to fail.
  852. Returns
  853. -------
  854. DataFrame interchange object
  855. The object which consuming library can use to ingress the dataframe.
  856. Notes
  857. -----
  858. Details on the interchange protocol:
  859. https://data-apis.org/dataframe-protocol/latest/index.html
  860. Examples
  861. --------
  862. >>> df_not_necessarily_pandas = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
  863. >>> interchange_object = df_not_necessarily_pandas.__dataframe__()
  864. >>> interchange_object.column_names()
  865. Index(['A', 'B'], dtype='object')
  866. >>> df_pandas = (pd.api.interchange.from_dataframe
  867. ... (interchange_object.select_columns_by_name(['A'])))
  868. >>> df_pandas
  869. A
  870. 0 1
  871. 1 2
  872. These methods (``column_names``, ``select_columns_by_name``) should work
  873. for any dataframe library which implements the interchange protocol.
  874. """
  875. from pandas.core.interchange.dataframe import PandasDataFrameXchg
  876. return PandasDataFrameXchg(self, allow_copy=allow_copy)
  def __dataframe_consortium_standard__(
      self, *, api_version: str | None = None
  ) -> Any:
      """
      Provide entry point to the Consortium DataFrame Standard API.

      The implementation lives in the optional ``dataframe_api_compat``
      package, which is developed and maintained outside of pandas.
      Please report any issues to https://github.com/data-apis/dataframe-api-compat.
      """
      compat = import_optional_dependency("dataframe_api_compat")
      to_standard = compat.pandas_standard.convert_to_standard_compliant_dataframe
      return to_standard(self, api_version=api_version)
  def __arrow_c_stream__(self, requested_schema=None):
      """
      Export the pandas DataFrame as an Arrow C stream PyCapsule.

      The frame is converted through ``pyarrow.Table.from_pandas`` and follows
      its default handling of the index (the index is stored as a column
      except for RangeIndex).
      This conversion is not necessarily zero-copy.

      Parameters
      ----------
      requested_schema : PyCapsule, default None
          The schema to which the dataframe should be casted, passed as a
          PyCapsule containing a C ArrowSchema representation of the
          requested schema.

      Returns
      -------
      PyCapsule
      """
      pyarrow = import_optional_dependency("pyarrow", min_version="14.0.0")

      # Only a supplied capsule needs decoding; None is passed straight on.
      schema = (
          None
          if requested_schema is None
          else pyarrow.Schema._import_from_c_capsule(requested_schema)
      )
      return pyarrow.Table.from_pandas(self, schema=schema).__arrow_c_stream__()
  913. # ----------------------------------------------------------------------
  @property
  def axes(self) -> list[Index]:
      """
      Return a list representing the axes of the DataFrame.

      The list has exactly two members: the row axis labels followed by the
      column axis labels.

      Examples
      --------
      >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
      >>> df.axes
      [RangeIndex(start=0, stop=2, step=1), Index(['col1', 'col2'],
      dtype='object')]
      """
      row_labels, column_labels = self.index, self.columns
      return [row_labels, column_labels]
  @property
  def shape(self) -> tuple[int, int]:
      """
      Return a tuple representing the dimensionality of the DataFrame.

      The tuple is ``(number of rows, number of columns)``.

      See Also
      --------
      ndarray.shape : Tuple of array dimensions.

      Examples
      --------
      >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
      >>> df.shape
      (2, 2)

      >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4],
      ...                    'col3': [5, 6]})
      >>> df.shape
      (2, 3)
      """
      n_rows = len(self.index)
      n_cols = len(self.columns)
      return n_rows, n_cols
  946. @property
  947. def _is_homogeneous_type(self) -> bool:
  948. """
  949. Whether all the columns in a DataFrame have the same type.
  950. Returns
  951. -------
  952. bool
  953. Examples
  954. --------
  955. >>> DataFrame({"A": [1, 2], "B": [3, 4]})._is_homogeneous_type
  956. True
  957. >>> DataFrame({"A": [1, 2], "B": [3.0, 4.0]})._is_homogeneous_type
  958. False
  959. Items with the same type but different sizes are considered
  960. different types.
  961. >>> DataFrame({
  962. ... "A": np.array([1, 2], dtype=np.int32),
  963. ... "B": np.array([1, 2], dtype=np.int64)})._is_homogeneous_type
  964. False
  965. """
  966. # The "<" part of "<=" here is for empty DataFrame cases
  967. return len({arr.dtype for arr in self._mgr.arrays}) <= 1
  @property
  def _can_fast_transpose(self) -> bool:
      """
      Can we transpose this DataFrame without creating any new array objects.
      """
      mgr = self._mgr
      if isinstance(mgr, ArrayManager):
          return False

      blocks = mgr.blocks
      if len(blocks) == 1:
          # TODO(EA2D) special case would be unnecessary with 2D EAs
          return not is_1d_only_ea_dtype(blocks[0].dtype)
      return False
  @property
  def _values(self) -> np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray:
      """
      Analogue to ._values that may return a 2D ExtensionArray.
      """
      manager = self._mgr

      if isinstance(manager, ArrayManager):
          arrays = manager.arrays
          if len(arrays) == 1 and not is_1d_only_ea_dtype(arrays[0].dtype):
              # Single column: present it as an (n, 1) array.
              # error: Item "ExtensionArray" of "Union[ndarray, ExtensionArray]"
              # has no attribute "reshape"
              return arrays[0].reshape(-1, 1)  # type: ignore[union-attr]
          return ensure_wrapped_if_datetimelike(self.values)

      blocks = manager.blocks
      if len(blocks) != 1:
          return ensure_wrapped_if_datetimelike(self.values)

      block_values = blocks[0].values
      if block_values.ndim == 1:
          # non-2D ExtensionArray
          return self.values

      # more generally, whatever we allow in NDArrayBackedExtensionBlock
      block_values = cast(
          "np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray", block_values
      )
      return block_values.T
  1003. # ----------------------------------------------------------------------
  1004. # Rendering Methods
  1005. def _repr_fits_vertical_(self) -> bool:
  1006. """
  1007. Check length against max_rows.
  1008. """
  1009. max_rows = get_option("display.max_rows")
  1010. return len(self) <= max_rows
  1011. def _repr_fits_horizontal_(self) -> bool:
  1012. """
  1013. Check if full repr fits in horizontal boundaries imposed by the display
  1014. options width and max_columns.
  1015. """
  1016. width, height = console.get_console_size()
  1017. max_columns = get_option("display.max_columns")
  1018. nb_columns = len(self.columns)
  1019. # exceed max columns
  1020. if (max_columns and nb_columns > max_columns) or (
  1021. width and nb_columns > (width // 2)
  1022. ):
  1023. return False
  1024. # used by repr_html under IPython notebook or scripts ignore terminal
  1025. # dims
  1026. if width is None or not console.in_interactive_session():
  1027. return True
  1028. if get_option("display.width") is not None or console.in_ipython_frontend():
  1029. # check at least the column row for excessive width
  1030. max_rows = 1
  1031. else:
  1032. max_rows = get_option("display.max_rows")
  1033. # when auto-detecting, so width=None and not in ipython front end
  1034. # check whether repr fits horizontal by actually checking
  1035. # the width of the rendered repr
  1036. buf = StringIO()
  1037. # only care about the stuff we'll actually print out
  1038. # and to_string on entire frame may be expensive
  1039. d = self
  1040. if max_rows is not None: # unlimited rows
  1041. # min of two, where one may be None
  1042. d = d.iloc[: min(max_rows, len(d))]
  1043. else:
  1044. return True
  1045. d.to_string(buf=buf)
  1046. value = buf.getvalue()
  1047. repr_width = max(len(line) for line in value.split("\n"))
  1048. return repr_width < width
  1049. def _info_repr(self) -> bool:
  1050. """
  1051. True if the repr should show the info view.
  1052. """
  1053. info_repr_option = get_option("display.large_repr") == "info"
  1054. return info_repr_option and not (
  1055. self._repr_fits_horizontal_() and self._repr_fits_vertical_()
  1056. )
  1057. def __repr__(self) -> str:
  1058. """
  1059. Return a string representation for a particular DataFrame.
  1060. """
  1061. if self._info_repr():
  1062. buf = StringIO()
  1063. self.info(buf=buf)
  1064. return buf.getvalue()
  1065. repr_params = fmt.get_dataframe_repr_params()
  1066. return self.to_string(**repr_params)
  1067. def _repr_html_(self) -> str | None:
  1068. """
  1069. Return a html representation for a particular DataFrame.
  1070. Mainly for IPython notebook.
  1071. """
  1072. if self._info_repr():
  1073. buf = StringIO()
  1074. self.info(buf=buf)
  1075. # need to escape the <class>, should be the first line.
  1076. val = buf.getvalue().replace("<", r"&lt;", 1)
  1077. val = val.replace(">", r"&gt;", 1)
  1078. return f"<pre>{val}</pre>"
  1079. if get_option("display.notebook_repr_html"):
  1080. max_rows = get_option("display.max_rows")
  1081. min_rows = get_option("display.min_rows")
  1082. max_cols = get_option("display.max_columns")
  1083. show_dimensions = get_option("display.show_dimensions")
  1084. formatter = fmt.DataFrameFormatter(
  1085. self,
  1086. columns=None,
  1087. col_space=None,
  1088. na_rep="NaN",
  1089. formatters=None,
  1090. float_format=None,
  1091. sparsify=None,
  1092. justify=None,
  1093. index_names=True,
  1094. header=True,
  1095. index=True,
  1096. bold_rows=True,
  1097. escape=True,
  1098. max_rows=max_rows,
  1099. min_rows=min_rows,
  1100. max_cols=max_cols,
  1101. show_dimensions=show_dimensions,
  1102. decimal=".",
  1103. )
  1104. return fmt.DataFrameRenderer(formatter).to_html(notebook=True)
  1105. else:
  1106. return None
  # Overload: without a buffer the rendered text is returned.
  @overload
  def to_string(
      self,
      buf: None = ...,
      columns: Axes | None = ...,
      col_space: int | list[int] | dict[Hashable, int] | None = ...,
      header: bool | SequenceNotStr[str] = ...,
      index: bool = ...,
      na_rep: str = ...,
      formatters: fmt.FormattersType | None = ...,
      float_format: fmt.FloatFormatType | None = ...,
      sparsify: bool | None = ...,
      index_names: bool = ...,
      justify: str | None = ...,
      max_rows: int | None = ...,
      max_cols: int | None = ...,
      show_dimensions: bool = ...,
      decimal: str = ...,
      line_width: int | None = ...,
      min_rows: int | None = ...,
      max_colwidth: int | None = ...,
      encoding: str | None = ...,
  ) -> str:
      ...

  # Overload: with a path or writable buffer nothing is returned.
  @overload
  def to_string(
      self,
      buf: FilePath | WriteBuffer[str],
      columns: Axes | None = ...,
      col_space: int | list[int] | dict[Hashable, int] | None = ...,
      header: bool | SequenceNotStr[str] = ...,
      index: bool = ...,
      na_rep: str = ...,
      formatters: fmt.FormattersType | None = ...,
      float_format: fmt.FloatFormatType | None = ...,
      sparsify: bool | None = ...,
      index_names: bool = ...,
      justify: str | None = ...,
      max_rows: int | None = ...,
      max_cols: int | None = ...,
      show_dimensions: bool = ...,
      decimal: str = ...,
      line_width: int | None = ...,
      min_rows: int | None = ...,
      max_colwidth: int | None = ...,
      encoding: str | None = ...,
  ) -> None:
      ...

  @deprecate_nonkeyword_arguments(
      version="3.0", allowed_args=["self", "buf"], name="to_string"
  )
  @Substitution(
      header_type="bool or list of str",
      header="Write out the column names. If a list of columns "
      "is given, it is assumed to be aliases for the "
      "column names",
      col_space_type="int, list or dict of int",
      col_space="The minimum width of each column. If a list of ints is given "
      "every integers corresponds with one column. If a dict is given, the key "
      "references the column, while the value defines the space to use.",
  )
  @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring)
  def to_string(
      self,
      buf: FilePath | WriteBuffer[str] | None = None,
      columns: Axes | None = None,
      col_space: int | list[int] | dict[Hashable, int] | None = None,
      header: bool | SequenceNotStr[str] = True,
      index: bool = True,
      na_rep: str = "NaN",
      formatters: fmt.FormattersType | None = None,
      float_format: fmt.FloatFormatType | None = None,
      sparsify: bool | None = None,
      index_names: bool = True,
      justify: str | None = None,
      max_rows: int | None = None,
      max_cols: int | None = None,
      show_dimensions: bool = False,
      decimal: str = ".",
      line_width: int | None = None,
      min_rows: int | None = None,
      max_colwidth: int | None = None,
      encoding: str | None = None,
  ) -> str | None:
      """
      Render a DataFrame to a console-friendly tabular output.
      %(shared_params)s
      line_width : int, optional
          Width to wrap a line in characters.
      min_rows : int, optional
          The number of rows to display in the console in a truncated repr
          (when number of rows is above `max_rows`).
      max_colwidth : int, optional
          Max width to truncate each column in characters. By default, no limit.
      encoding : str, default "utf-8"
          Set character encoding.
      %(returns)s
      See Also
      --------
      to_html : Convert DataFrame to HTML.

      Examples
      --------
      >>> d = {'col1': [1, 2, 3], 'col2': [4, 5, 6]}
      >>> df = pd.DataFrame(d)
      >>> print(df.to_string())
         col1  col2
      0     1     4
      1     2     5
      2     3     6
      """
      # NOTE: the docstring above is a template filled in by the Substitution
      # decorators; keep the %(...)s placeholders intact.
      from pandas import option_context

      formatter_options = {
          "columns": columns,
          "col_space": col_space,
          "na_rep": na_rep,
          "formatters": formatters,
          "float_format": float_format,
          "sparsify": sparsify,
          "justify": justify,
          "index_names": index_names,
          "header": header,
          "index": index,
          "min_rows": min_rows,
          "max_rows": max_rows,
          "max_cols": max_cols,
          "show_dimensions": show_dimensions,
          "decimal": decimal,
      }

      # Formatter construction and rendering both run while
      # "display.max_colwidth" is temporarily set to ``max_colwidth``.
      with option_context("display.max_colwidth", max_colwidth):
          formatter = fmt.DataFrameFormatter(self, **formatter_options)
          renderer = fmt.DataFrameRenderer(formatter)
          return renderer.to_string(
              buf=buf,
              encoding=encoding,
              line_width=line_width,
          )
  1242. def _get_values_for_csv(
  1243. self,
  1244. *,
  1245. float_format: FloatFormatType | None,
  1246. date_format: str | None,
  1247. decimal: str,
  1248. na_rep: str,
  1249. quoting, # int csv.QUOTE_FOO from stdlib
  1250. ) -> Self:
  1251. # helper used by to_csv
  1252. mgr = self._mgr.get_values_for_csv(
  1253. float_format=float_format,
  1254. date_format=date_format,
  1255. decimal=decimal,
  1256. na_rep=na_rep,
  1257. quoting=quoting,
  1258. )
  1259. # error: Incompatible return value type (got "DataFrame", expected "Self")
  1260. return self._constructor_from_mgr(mgr, axes=mgr.axes) # type: ignore[return-value]
  1261. # ----------------------------------------------------------------------
  1262. @property
  1263. def style(self) -> Styler:
  1264. """
  1265. Returns a Styler object.
  1266. Contains methods for building a styled HTML representation of the DataFrame.
  1267. See Also
  1268. --------
  1269. io.formats.style.Styler : Helps style a DataFrame or Series according to the
  1270. data with HTML and CSS.
  1271. Examples
  1272. --------
  1273. >>> df = pd.DataFrame({'A': [1, 2, 3]})
  1274. >>> df.style # doctest: +SKIP
  1275. Please see
  1276. `Table Visualization <../../user_guide/style.ipynb>`_ for more examples.
  1277. """
  1278. # Raise AttributeError so that inspect works even if jinja2 is not installed.
  1279. has_jinja2 = import_optional_dependency("jinja2", errors="ignore")
  1280. if not has_jinja2:
  1281. raise AttributeError("The '.style' accessor requires jinja2")
  1282. from pandas.io.formats.style import Styler
  1283. return Styler(self)
  1284. _shared_docs[
  1285. "items"
  1286. ] = r"""
  1287. Iterate over (column name, Series) pairs.
  1288. Iterates over the DataFrame columns, returning a tuple with
  1289. the column name and the content as a Series.
  1290. Yields
  1291. ------
  1292. label : object
  1293. The column names for the DataFrame being iterated over.
  1294. content : Series
  1295. The column entries belonging to each label, as a Series.
  1296. See Also
  1297. --------
  1298. DataFrame.iterrows : Iterate over DataFrame rows as
  1299. (index, Series) pairs.
  1300. DataFrame.itertuples : Iterate over DataFrame rows as namedtuples
  1301. of the values.
  1302. Examples
  1303. --------
  1304. >>> df = pd.DataFrame({'species': ['bear', 'bear', 'marsupial'],
  1305. ... 'population': [1864, 22000, 80000]},
  1306. ... index=['panda', 'polar', 'koala'])
  1307. >>> df
  1308. species population
  1309. panda bear 1864
  1310. polar bear 22000
  1311. koala marsupial 80000
  1312. >>> for label, content in df.items():
  1313. ... print(f'label: {label}')
  1314. ... print(f'content: {content}', sep='\n')
  1315. ...
  1316. label: species
  1317. content:
  1318. panda bear
  1319. polar bear
  1320. koala marsupial
  1321. Name: species, dtype: object
  1322. label: population
  1323. content:
  1324. panda 1864
  1325. polar 22000
  1326. koala 80000
  1327. Name: population, dtype: int64
  1328. """
  1329. @Appender(_shared_docs["items"])
  1330. def items(self) -> Iterable[tuple[Hashable, Series]]:
  1331. if self.columns.is_unique and hasattr(self, "_item_cache"):
  1332. for k in self.columns:
  1333. yield k, self._get_item_cache(k)
  1334. else:
  1335. for i, k in enumerate(self.columns):
  1336. yield k, self._ixs(i, axis=1)
  1337. def iterrows(self) -> Iterable[tuple[Hashable, Series]]:
  1338. """
  1339. Iterate over DataFrame rows as (index, Series) pairs.
  1340. Yields
  1341. ------
  1342. index : label or tuple of label
  1343. The index of the row. A tuple for a `MultiIndex`.
  1344. data : Series
  1345. The data of the row as a Series.
  1346. See Also
  1347. --------
  1348. DataFrame.itertuples : Iterate over DataFrame rows as namedtuples of the values.
  1349. DataFrame.items : Iterate over (column name, Series) pairs.
  1350. Notes
  1351. -----
  1352. 1. Because ``iterrows`` returns a Series for each row,
  1353. it does **not** preserve dtypes across the rows (dtypes are
  1354. preserved across columns for DataFrames).
  1355. To preserve dtypes while iterating over the rows, it is better
  1356. to use :meth:`itertuples` which returns namedtuples of the values
  1357. and which is generally faster than ``iterrows``.
  1358. 2. You should **never modify** something you are iterating over.
  1359. This is not guaranteed to work in all cases. Depending on the
  1360. data types, the iterator returns a copy and not a view, and writing
  1361. to it will have no effect.
  1362. Examples
  1363. --------
  1364. >>> df = pd.DataFrame([[1, 1.5]], columns=['int', 'float'])
  1365. >>> row = next(df.iterrows())[1]
  1366. >>> row
  1367. int 1.0
  1368. float 1.5
  1369. Name: 0, dtype: float64
  1370. >>> print(row['int'].dtype)
  1371. float64
  1372. >>> print(df['int'].dtype)
  1373. int64
  1374. """
  1375. columns = self.columns
  1376. klass = self._constructor_sliced
  1377. using_cow = using_copy_on_write()
  1378. for k, v in zip(self.index, self.values):
  1379. s = klass(v, index=columns, name=k).__finalize__(self)
  1380. if using_cow and self._mgr.is_single_block:
  1381. s._mgr.add_references(self._mgr) # type: ignore[arg-type]
  1382. yield k, s
  1383. def itertuples(
  1384. self, index: bool = True, name: str | None = "Pandas"
  1385. ) -> Iterable[tuple[Any, ...]]:
  1386. """
  1387. Iterate over DataFrame rows as namedtuples.
  1388. Parameters
  1389. ----------
  1390. index : bool, default True
  1391. If True, return the index as the first element of the tuple.
  1392. name : str or None, default "Pandas"
  1393. The name of the returned namedtuples or None to return regular
  1394. tuples.
  1395. Returns
  1396. -------
  1397. iterator
  1398. An object to iterate over namedtuples for each row in the
  1399. DataFrame with the first field possibly being the index and
  1400. following fields being the column values.
  1401. See Also
  1402. --------
  1403. DataFrame.iterrows : Iterate over DataFrame rows as (index, Series)
  1404. pairs.
  1405. DataFrame.items : Iterate over (column name, Series) pairs.
  1406. Notes
  1407. -----
  1408. The column names will be renamed to positional names if they are
  1409. invalid Python identifiers, repeated, or start with an underscore.
  1410. Examples
  1411. --------
  1412. >>> df = pd.DataFrame({'num_legs': [4, 2], 'num_wings': [0, 2]},
  1413. ... index=['dog', 'hawk'])
  1414. >>> df
  1415. num_legs num_wings
  1416. dog 4 0
  1417. hawk 2 2
  1418. >>> for row in df.itertuples():
  1419. ... print(row)
  1420. ...
  1421. Pandas(Index='dog', num_legs=4, num_wings=0)
  1422. Pandas(Index='hawk', num_legs=2, num_wings=2)
  1423. By setting the `index` parameter to False we can remove the index
  1424. as the first element of the tuple:
  1425. >>> for row in df.itertuples(index=False):
  1426. ... print(row)
  1427. ...
  1428. Pandas(num_legs=4, num_wings=0)
  1429. Pandas(num_legs=2, num_wings=2)
  1430. With the `name` parameter set we set a custom name for the yielded
  1431. namedtuples:
  1432. >>> for row in df.itertuples(name='Animal'):
  1433. ... print(row)
  1434. ...
  1435. Animal(Index='dog', num_legs=4, num_wings=0)
  1436. Animal(Index='hawk', num_legs=2, num_wings=2)
  1437. """
  1438. arrays = []
  1439. fields = list(self.columns)
  1440. if index:
  1441. arrays.append(self.index)
  1442. fields.insert(0, "Index")
  1443. # use integer indexing because of possible duplicate column names
  1444. arrays.extend(self.iloc[:, k] for k in range(len(self.columns)))
  1445. if name is not None:
  1446. # https://github.com/python/mypy/issues/9046
  1447. # error: namedtuple() expects a string literal as the first argument
  1448. itertuple = collections.namedtuple( # type: ignore[misc]
  1449. name, fields, rename=True
  1450. )
  1451. return map(itertuple._make, zip(*arrays))
  1452. # fallback to regular tuples
  1453. return zip(*arrays)
  1454. def __len__(self) -> int:
  1455. """
  1456. Returns length of info axis, but here we use the index.
  1457. """
  1458. return len(self.index)
  1459. @overload
  1460. def dot(self, other: Series) -> Series:
  1461. ...
  1462. @overload
  1463. def dot(self, other: DataFrame | Index | ArrayLike) -> DataFrame:
  1464. ...
  1465. def dot(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series:
  1466. """
  1467. Compute the matrix multiplication between the DataFrame and other.
  1468. This method computes the matrix product between the DataFrame and the
  1469. values of an other Series, DataFrame or a numpy array.
  1470. It can also be called using ``self @ other``.
  1471. Parameters
  1472. ----------
  1473. other : Series, DataFrame or array-like
  1474. The other object to compute the matrix product with.
  1475. Returns
  1476. -------
  1477. Series or DataFrame
  1478. If other is a Series, return the matrix product between self and
  1479. other as a Series. If other is a DataFrame or a numpy.array, return
  1480. the matrix product of self and other in a DataFrame of a np.array.
  1481. See Also
  1482. --------
  1483. Series.dot: Similar method for Series.
  1484. Notes
  1485. -----
  1486. The dimensions of DataFrame and other must be compatible in order to
  1487. compute the matrix multiplication. In addition, the column names of
  1488. DataFrame and the index of other must contain the same values, as they
  1489. will be aligned prior to the multiplication.
  1490. The dot method for Series computes the inner product, instead of the
  1491. matrix product here.
  1492. Examples
  1493. --------
  1494. Here we multiply a DataFrame with a Series.
  1495. >>> df = pd.DataFrame([[0, 1, -2, -1], [1, 1, 1, 1]])
  1496. >>> s = pd.Series([1, 1, 2, 1])
  1497. >>> df.dot(s)
  1498. 0 -4
  1499. 1 5
  1500. dtype: int64
  1501. Here we multiply a DataFrame with another DataFrame.
  1502. >>> other = pd.DataFrame([[0, 1], [1, 2], [-1, -1], [2, 0]])
  1503. >>> df.dot(other)
  1504. 0 1
  1505. 0 1 4
  1506. 1 2 2
  1507. Note that the dot method give the same result as @
  1508. >>> df @ other
  1509. 0 1
  1510. 0 1 4
  1511. 1 2 2
  1512. The dot method works also if other is an np.array.
  1513. >>> arr = np.array([[0, 1], [1, 2], [-1, -1], [2, 0]])
  1514. >>> df.dot(arr)
  1515. 0 1
  1516. 0 1 4
  1517. 1 2 2
  1518. Note how shuffling of the objects does not change the result.
  1519. >>> s2 = s.reindex([1, 0, 2, 3])
  1520. >>> df.dot(s2)
  1521. 0 -4
  1522. 1 5
  1523. dtype: int64
  1524. """
  1525. if isinstance(other, (Series, DataFrame)):
  1526. common = self.columns.union(other.index)
  1527. if len(common) > len(self.columns) or len(common) > len(other.index):
  1528. raise ValueError("matrices are not aligned")
  1529. left = self.reindex(columns=common, copy=False)
  1530. right = other.reindex(index=common, copy=False)
  1531. lvals = left.values
  1532. rvals = right._values
  1533. else:
  1534. left = self
  1535. lvals = self.values
  1536. rvals = np.asarray(other)
  1537. if lvals.shape[1] != rvals.shape[0]:
  1538. raise ValueError(
  1539. f"Dot product shape mismatch, {lvals.shape} vs {rvals.shape}"
  1540. )
  1541. if isinstance(other, DataFrame):
  1542. common_type = find_common_type(list(self.dtypes) + list(other.dtypes))
  1543. return self._constructor(
  1544. np.dot(lvals, rvals),
  1545. index=left.index,
  1546. columns=other.columns,
  1547. copy=False,
  1548. dtype=common_type,
  1549. )
  1550. elif isinstance(other, Series):
  1551. common_type = find_common_type(list(self.dtypes) + [other.dtypes])
  1552. return self._constructor_sliced(
  1553. np.dot(lvals, rvals), index=left.index, copy=False, dtype=common_type
  1554. )
  1555. elif isinstance(rvals, (np.ndarray, Index)):
  1556. result = np.dot(lvals, rvals)
  1557. if result.ndim == 2:
  1558. return self._constructor(result, index=left.index, copy=False)
  1559. else:
  1560. return self._constructor_sliced(result, index=left.index, copy=False)
  1561. else: # pragma: no cover
  1562. raise TypeError(f"unsupported type: {type(other)}")
  1563. @overload
  1564. def __matmul__(self, other: Series) -> Series:
  1565. ...
  1566. @overload
  1567. def __matmul__(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series:
  1568. ...
  1569. def __matmul__(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series:
  1570. """
  1571. Matrix multiplication using binary `@` operator.
  1572. """
  1573. return self.dot(other)
  1574. def __rmatmul__(self, other) -> DataFrame:
  1575. """
  1576. Matrix multiplication using binary `@` operator.
  1577. """
  1578. try:
  1579. return self.T.dot(np.transpose(other)).T
  1580. except ValueError as err:
  1581. if "shape mismatch" not in str(err):
  1582. raise
  1583. # GH#21581 give exception message for original shapes
  1584. msg = f"shapes {np.shape(other)} and {self.shape} not aligned"
  1585. raise ValueError(msg) from err
  1586. # ----------------------------------------------------------------------
  1587. # IO methods (to / from other formats)
  1588. @classmethod
  1589. def from_dict(
  1590. cls,
  1591. data: dict,
  1592. orient: FromDictOrient = "columns",
  1593. dtype: Dtype | None = None,
  1594. columns: Axes | None = None,
  1595. ) -> DataFrame:
  1596. """
  1597. Construct DataFrame from dict of array-like or dicts.
  1598. Creates DataFrame object from dictionary by columns or by index
  1599. allowing dtype specification.
  1600. Parameters
  1601. ----------
  1602. data : dict
  1603. Of the form {field : array-like} or {field : dict}.
  1604. orient : {'columns', 'index', 'tight'}, default 'columns'
  1605. The "orientation" of the data. If the keys of the passed dict
  1606. should be the columns of the resulting DataFrame, pass 'columns'
  1607. (default). Otherwise if the keys should be rows, pass 'index'.
  1608. If 'tight', assume a dict with keys ['index', 'columns', 'data',
  1609. 'index_names', 'column_names'].
  1610. .. versionadded:: 1.4.0
  1611. 'tight' as an allowed value for the ``orient`` argument
  1612. dtype : dtype, default None
  1613. Data type to force after DataFrame construction, otherwise infer.
  1614. columns : list, default None
  1615. Column labels to use when ``orient='index'``. Raises a ValueError
  1616. if used with ``orient='columns'`` or ``orient='tight'``.
  1617. Returns
  1618. -------
  1619. DataFrame
  1620. See Also
  1621. --------
  1622. DataFrame.from_records : DataFrame from structured ndarray, sequence
  1623. of tuples or dicts, or DataFrame.
  1624. DataFrame : DataFrame object creation using constructor.
  1625. DataFrame.to_dict : Convert the DataFrame to a dictionary.
  1626. Examples
  1627. --------
  1628. By default the keys of the dict become the DataFrame columns:
  1629. >>> data = {'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd']}
  1630. >>> pd.DataFrame.from_dict(data)
  1631. col_1 col_2
  1632. 0 3 a
  1633. 1 2 b
  1634. 2 1 c
  1635. 3 0 d
  1636. Specify ``orient='index'`` to create the DataFrame using dictionary
  1637. keys as rows:
  1638. >>> data = {'row_1': [3, 2, 1, 0], 'row_2': ['a', 'b', 'c', 'd']}
  1639. >>> pd.DataFrame.from_dict(data, orient='index')
  1640. 0 1 2 3
  1641. row_1 3 2 1 0
  1642. row_2 a b c d
  1643. When using the 'index' orientation, the column names can be
  1644. specified manually:
  1645. >>> pd.DataFrame.from_dict(data, orient='index',
  1646. ... columns=['A', 'B', 'C', 'D'])
  1647. A B C D
  1648. row_1 3 2 1 0
  1649. row_2 a b c d
  1650. Specify ``orient='tight'`` to create the DataFrame using a 'tight'
  1651. format:
  1652. >>> data = {'index': [('a', 'b'), ('a', 'c')],
  1653. ... 'columns': [('x', 1), ('y', 2)],
  1654. ... 'data': [[1, 3], [2, 4]],
  1655. ... 'index_names': ['n1', 'n2'],
  1656. ... 'column_names': ['z1', 'z2']}
  1657. >>> pd.DataFrame.from_dict(data, orient='tight')
  1658. z1 x y
  1659. z2 1 2
  1660. n1 n2
  1661. a b 1 3
  1662. c 2 4
  1663. """
  1664. index = None
  1665. orient = orient.lower() # type: ignore[assignment]
  1666. if orient == "index":
  1667. if len(data) > 0:
  1668. # TODO speed up Series case
  1669. if isinstance(next(iter(data.values())), (Series, dict)):
  1670. data = _from_nested_dict(data)
  1671. else:
  1672. index = list(data.keys())
  1673. # error: Incompatible types in assignment (expression has type
  1674. # "List[Any]", variable has type "Dict[Any, Any]")
  1675. data = list(data.values()) # type: ignore[assignment]
  1676. elif orient in ("columns", "tight"):
  1677. if columns is not None:
  1678. raise ValueError(f"cannot use columns parameter with orient='{orient}'")
  1679. else: # pragma: no cover
  1680. raise ValueError(
  1681. f"Expected 'index', 'columns' or 'tight' for orient parameter. "
  1682. f"Got '{orient}' instead"
  1683. )
  1684. if orient != "tight":
  1685. return cls(data, index=index, columns=columns, dtype=dtype)
  1686. else:
  1687. realdata = data["data"]
  1688. def create_index(indexlist, namelist):
  1689. index: Index
  1690. if len(namelist) > 1:
  1691. index = MultiIndex.from_tuples(indexlist, names=namelist)
  1692. else:
  1693. index = Index(indexlist, name=namelist[0])
  1694. return index
  1695. index = create_index(data["index"], data["index_names"])
  1696. columns = create_index(data["columns"], data["column_names"])
  1697. return cls(realdata, index=index, columns=columns, dtype=dtype)
  1698. def to_numpy(
  1699. self,
  1700. dtype: npt.DTypeLike | None = None,
  1701. copy: bool = False,
  1702. na_value: object = lib.no_default,
  1703. ) -> np.ndarray:
  1704. """
  1705. Convert the DataFrame to a NumPy array.
  1706. By default, the dtype of the returned array will be the common NumPy
  1707. dtype of all types in the DataFrame. For example, if the dtypes are
  1708. ``float16`` and ``float32``, the results dtype will be ``float32``.
  1709. This may require copying data and coercing values, which may be
  1710. expensive.
  1711. Parameters
  1712. ----------
  1713. dtype : str or numpy.dtype, optional
  1714. The dtype to pass to :meth:`numpy.asarray`.
  1715. copy : bool, default False
  1716. Whether to ensure that the returned value is not a view on
  1717. another array. Note that ``copy=False`` does not *ensure* that
  1718. ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensure that
  1719. a copy is made, even if not strictly necessary.
  1720. na_value : Any, optional
  1721. The value to use for missing values. The default value depends
  1722. on `dtype` and the dtypes of the DataFrame columns.
  1723. Returns
  1724. -------
  1725. numpy.ndarray
  1726. See Also
  1727. --------
  1728. Series.to_numpy : Similar method for Series.
  1729. Examples
  1730. --------
  1731. >>> pd.DataFrame({"A": [1, 2], "B": [3, 4]}).to_numpy()
  1732. array([[1, 3],
  1733. [2, 4]])
  1734. With heterogeneous data, the lowest common type will have to
  1735. be used.
  1736. >>> df = pd.DataFrame({"A": [1, 2], "B": [3.0, 4.5]})
  1737. >>> df.to_numpy()
  1738. array([[1. , 3. ],
  1739. [2. , 4.5]])
  1740. For a mix of numeric and non-numeric types, the output array will
  1741. have object dtype.
  1742. >>> df['C'] = pd.date_range('2000', periods=2)
  1743. >>> df.to_numpy()
  1744. array([[1, 3.0, Timestamp('2000-01-01 00:00:00')],
  1745. [2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object)
  1746. """
  1747. if dtype is not None:
  1748. dtype = np.dtype(dtype)
  1749. result = self._mgr.as_array(dtype=dtype, copy=copy, na_value=na_value)
  1750. if result.dtype is not dtype:
  1751. result = np.asarray(result, dtype=dtype)
  1752. return result
  1753. def _create_data_for_split_and_tight_to_dict(
  1754. self, are_all_object_dtype_cols: bool, object_dtype_indices: list[int]
  1755. ) -> list:
  1756. """
  1757. Simple helper method to create data for to ``to_dict(orient="split")`` and
  1758. ``to_dict(orient="tight")`` to create the main output data
  1759. """
  1760. if are_all_object_dtype_cols:
  1761. data = [
  1762. list(map(maybe_box_native, t))
  1763. for t in self.itertuples(index=False, name=None)
  1764. ]
  1765. else:
  1766. data = [list(t) for t in self.itertuples(index=False, name=None)]
  1767. if object_dtype_indices:
  1768. # If we have object_dtype_cols, apply maybe_box_naive after list
  1769. # comprehension for perf
  1770. for row in data:
  1771. for i in object_dtype_indices:
  1772. row[i] = maybe_box_native(row[i])
  1773. return data
  1774. @overload
  1775. def to_dict(
  1776. self,
  1777. orient: Literal["dict", "list", "series", "split", "tight", "index"] = ...,
  1778. *,
  1779. into: type[MutableMappingT] | MutableMappingT,
  1780. index: bool = ...,
  1781. ) -> MutableMappingT:
  1782. ...
  1783. @overload
  1784. def to_dict(
  1785. self,
  1786. orient: Literal["records"],
  1787. *,
  1788. into: type[MutableMappingT] | MutableMappingT,
  1789. index: bool = ...,
  1790. ) -> list[MutableMappingT]:
  1791. ...
  1792. @overload
  1793. def to_dict(
  1794. self,
  1795. orient: Literal["dict", "list", "series", "split", "tight", "index"] = ...,
  1796. *,
  1797. into: type[dict] = ...,
  1798. index: bool = ...,
  1799. ) -> dict:
  1800. ...
  1801. @overload
  1802. def to_dict(
  1803. self,
  1804. orient: Literal["records"],
  1805. *,
  1806. into: type[dict] = ...,
  1807. index: bool = ...,
  1808. ) -> list[dict]:
  1809. ...
  1810. # error: Incompatible default for argument "into" (default has type "type
  1811. # [dict[Any, Any]]", argument has type "type[MutableMappingT] | MutableMappingT")
  1812. @deprecate_nonkeyword_arguments(
  1813. version="3.0", allowed_args=["self", "orient"], name="to_dict"
  1814. )
  1815. def to_dict(
  1816. self,
  1817. orient: Literal[
  1818. "dict", "list", "series", "split", "tight", "records", "index"
  1819. ] = "dict",
  1820. into: type[MutableMappingT]
  1821. | MutableMappingT = dict, # type: ignore[assignment]
  1822. index: bool = True,
  1823. ) -> MutableMappingT | list[MutableMappingT]:
  1824. """
  1825. Convert the DataFrame to a dictionary.
  1826. The type of the key-value pairs can be customized with the parameters
  1827. (see below).
  1828. Parameters
  1829. ----------
  1830. orient : str {'dict', 'list', 'series', 'split', 'tight', 'records', 'index'}
  1831. Determines the type of the values of the dictionary.
  1832. - 'dict' (default) : dict like {column -> {index -> value}}
  1833. - 'list' : dict like {column -> [values]}
  1834. - 'series' : dict like {column -> Series(values)}
  1835. - 'split' : dict like
  1836. {'index' -> [index], 'columns' -> [columns], 'data' -> [values]}
  1837. - 'tight' : dict like
  1838. {'index' -> [index], 'columns' -> [columns], 'data' -> [values],
  1839. 'index_names' -> [index.names], 'column_names' -> [column.names]}
  1840. - 'records' : list like
  1841. [{column -> value}, ... , {column -> value}]
  1842. - 'index' : dict like {index -> {column -> value}}
  1843. .. versionadded:: 1.4.0
  1844. 'tight' as an allowed value for the ``orient`` argument
  1845. into : class, default dict
  1846. The collections.abc.MutableMapping subclass used for all Mappings
  1847. in the return value. Can be the actual class or an empty
  1848. instance of the mapping type you want. If you want a
  1849. collections.defaultdict, you must pass it initialized.
  1850. index : bool, default True
  1851. Whether to include the index item (and index_names item if `orient`
  1852. is 'tight') in the returned dictionary. Can only be ``False``
  1853. when `orient` is 'split' or 'tight'.
  1854. .. versionadded:: 2.0.0
  1855. Returns
  1856. -------
  1857. dict, list or collections.abc.MutableMapping
  1858. Return a collections.abc.MutableMapping object representing the
  1859. DataFrame. The resulting transformation depends on the `orient`
  1860. parameter.
  1861. See Also
  1862. --------
  1863. DataFrame.from_dict: Create a DataFrame from a dictionary.
  1864. DataFrame.to_json: Convert a DataFrame to JSON format.
  1865. Examples
  1866. --------
  1867. >>> df = pd.DataFrame({'col1': [1, 2],
  1868. ... 'col2': [0.5, 0.75]},
  1869. ... index=['row1', 'row2'])
  1870. >>> df
  1871. col1 col2
  1872. row1 1 0.50
  1873. row2 2 0.75
  1874. >>> df.to_dict()
  1875. {'col1': {'row1': 1, 'row2': 2}, 'col2': {'row1': 0.5, 'row2': 0.75}}
  1876. You can specify the return orientation.
  1877. >>> df.to_dict('series')
  1878. {'col1': row1 1
  1879. row2 2
  1880. Name: col1, dtype: int64,
  1881. 'col2': row1 0.50
  1882. row2 0.75
  1883. Name: col2, dtype: float64}
  1884. >>> df.to_dict('split')
  1885. {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'],
  1886. 'data': [[1, 0.5], [2, 0.75]]}
  1887. >>> df.to_dict('records')
  1888. [{'col1': 1, 'col2': 0.5}, {'col1': 2, 'col2': 0.75}]
  1889. >>> df.to_dict('index')
  1890. {'row1': {'col1': 1, 'col2': 0.5}, 'row2': {'col1': 2, 'col2': 0.75}}
  1891. >>> df.to_dict('tight')
  1892. {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'],
  1893. 'data': [[1, 0.5], [2, 0.75]], 'index_names': [None], 'column_names': [None]}
  1894. You can also specify the mapping type.
  1895. >>> from collections import OrderedDict, defaultdict
  1896. >>> df.to_dict(into=OrderedDict)
  1897. OrderedDict([('col1', OrderedDict([('row1', 1), ('row2', 2)])),
  1898. ('col2', OrderedDict([('row1', 0.5), ('row2', 0.75)]))])
  1899. If you want a `defaultdict`, you need to initialize it:
  1900. >>> dd = defaultdict(list)
  1901. >>> df.to_dict('records', into=dd)
  1902. [defaultdict(<class 'list'>, {'col1': 1, 'col2': 0.5}),
  1903. defaultdict(<class 'list'>, {'col1': 2, 'col2': 0.75})]
  1904. """
  1905. from pandas.core.methods.to_dict import to_dict
  1906. return to_dict(self, orient, into=into, index=index)
  1907. @deprecate_nonkeyword_arguments(
  1908. version="3.0", allowed_args=["self", "destination_table"], name="to_gbq"
  1909. )
  1910. def to_gbq(
  1911. self,
  1912. destination_table: str,
  1913. project_id: str | None = None,
  1914. chunksize: int | None = None,
  1915. reauth: bool = False,
  1916. if_exists: ToGbqIfexist = "fail",
  1917. auth_local_webserver: bool = True,
  1918. table_schema: list[dict[str, str]] | None = None,
  1919. location: str | None = None,
  1920. progress_bar: bool = True,
  1921. credentials=None,
  1922. ) -> None:
  1923. """
  1924. Write a DataFrame to a Google BigQuery table.
  1925. .. deprecated:: 2.2.0
  1926. Please use ``pandas_gbq.to_gbq`` instead.
  1927. This function requires the `pandas-gbq package
  1928. <https://pandas-gbq.readthedocs.io>`__.
  1929. See the `How to authenticate with Google BigQuery
  1930. <https://pandas-gbq.readthedocs.io/en/latest/howto/authentication.html>`__
  1931. guide for authentication instructions.
  1932. Parameters
  1933. ----------
  1934. destination_table : str
  1935. Name of table to be written, in the form ``dataset.tablename``.
  1936. project_id : str, optional
  1937. Google BigQuery Account project ID. Optional when available from
  1938. the environment.
  1939. chunksize : int, optional
  1940. Number of rows to be inserted in each chunk from the dataframe.
  1941. Set to ``None`` to load the whole dataframe at once.
  1942. reauth : bool, default False
  1943. Force Google BigQuery to re-authenticate the user. This is useful
  1944. if multiple accounts are used.
  1945. if_exists : str, default 'fail'
  1946. Behavior when the destination table exists. Value can be one of:
  1947. ``'fail'``
  1948. If table exists raise pandas_gbq.gbq.TableCreationError.
  1949. ``'replace'``
  1950. If table exists, drop it, recreate it, and insert data.
  1951. ``'append'``
  1952. If table exists, insert data. Create if does not exist.
  1953. auth_local_webserver : bool, default True
  1954. Use the `local webserver flow`_ instead of the `console flow`_
  1955. when getting user credentials.
  1956. .. _local webserver flow:
  1957. https://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server
  1958. .. _console flow:
  1959. https://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console
  1960. *New in version 0.2.0 of pandas-gbq*.
  1961. .. versionchanged:: 1.5.0
  1962. Default value is changed to ``True``. Google has deprecated the
  1963. ``auth_local_webserver = False`` `"out of band" (copy-paste)
  1964. flow
  1965. <https://developers.googleblog.com/2022/02/making-oauth-flows-safer.html?m=1#disallowed-oob>`_.
  1966. table_schema : list of dicts, optional
  1967. List of BigQuery table fields to which according DataFrame
  1968. columns conform to, e.g. ``[{'name': 'col1', 'type':
  1969. 'STRING'},...]``. If schema is not provided, it will be
  1970. generated according to dtypes of DataFrame columns. See
  1971. BigQuery API documentation on available names of a field.
  1972. *New in version 0.3.1 of pandas-gbq*.
  1973. location : str, optional
  1974. Location where the load job should run. See the `BigQuery locations
  1975. documentation
  1976. <https://cloud.google.com/bigquery/docs/dataset-locations>`__ for a
  1977. list of available locations. The location must match that of the
  1978. target dataset.
  1979. *New in version 0.5.0 of pandas-gbq*.
  1980. progress_bar : bool, default True
  1981. Use the library `tqdm` to show the progress bar for the upload,
  1982. chunk by chunk.
  1983. *New in version 0.5.0 of pandas-gbq*.
  1984. credentials : google.auth.credentials.Credentials, optional
  1985. Credentials for accessing Google APIs. Use this parameter to
  1986. override default credentials, such as to use Compute Engine
  1987. :class:`google.auth.compute_engine.Credentials` or Service
  1988. Account :class:`google.oauth2.service_account.Credentials`
  1989. directly.
  1990. *New in version 0.8.0 of pandas-gbq*.
  1991. See Also
  1992. --------
  1993. pandas_gbq.to_gbq : This function in the pandas-gbq library.
  1994. read_gbq : Read a DataFrame from Google BigQuery.
  1995. Examples
  1996. --------
  1997. Example taken from `Google BigQuery documentation
  1998. <https://cloud.google.com/bigquery/docs/samples/bigquery-pandas-gbq-to-gbq-simple>`_
  1999. >>> project_id = "my-project"
  2000. >>> table_id = 'my_dataset.my_table'
  2001. >>> df = pd.DataFrame({
  2002. ... "my_string": ["a", "b", "c"],
  2003. ... "my_int64": [1, 2, 3],
  2004. ... "my_float64": [4.0, 5.0, 6.0],
  2005. ... "my_bool1": [True, False, True],
  2006. ... "my_bool2": [False, True, False],
  2007. ... "my_dates": pd.date_range("now", periods=3),
  2008. ... }
  2009. ... )
  2010. >>> df.to_gbq(table_id, project_id=project_id) # doctest: +SKIP
  2011. """
  2012. from pandas.io import gbq
  2013. gbq.to_gbq(
  2014. self,
  2015. destination_table,
  2016. project_id=project_id,
  2017. chunksize=chunksize,
  2018. reauth=reauth,
  2019. if_exists=if_exists,
  2020. auth_local_webserver=auth_local_webserver,
  2021. table_schema=table_schema,
  2022. location=location,
  2023. progress_bar=progress_bar,
  2024. credentials=credentials,
  2025. )
  2026. @classmethod
  2027. def from_records(
  2028. cls,
  2029. data,
  2030. index=None,
  2031. exclude=None,
  2032. columns=None,
  2033. coerce_float: bool = False,
  2034. nrows: int | None = None,
  2035. ) -> DataFrame:
  2036. """
  2037. Convert structured or record ndarray to DataFrame.
  2038. Creates a DataFrame object from a structured ndarray, sequence of
  2039. tuples or dicts, or DataFrame.
  2040. Parameters
  2041. ----------
  2042. data : structured ndarray, sequence of tuples or dicts, or DataFrame
  2043. Structured input data.
  2044. .. deprecated:: 2.1.0
  2045. Passing a DataFrame is deprecated.
  2046. index : str, list of fields, array-like
  2047. Field of array to use as the index, alternately a specific set of
  2048. input labels to use.
  2049. exclude : sequence, default None
  2050. Columns or fields to exclude.
  2051. columns : sequence, default None
  2052. Column names to use. If the passed data do not have names
  2053. associated with them, this argument provides names for the
  2054. columns. Otherwise this argument indicates the order of the columns
  2055. in the result (any names not found in the data will become all-NA
  2056. columns).
  2057. coerce_float : bool, default False
  2058. Attempt to convert values of non-string, non-numeric objects (like
  2059. decimal.Decimal) to floating point, useful for SQL result sets.
  2060. nrows : int, default None
  2061. Number of rows to read if data is an iterator.
  2062. Returns
  2063. -------
  2064. DataFrame
  2065. See Also
  2066. --------
  2067. DataFrame.from_dict : DataFrame from dict of array-like or dicts.
  2068. DataFrame : DataFrame object creation using constructor.
  2069. Examples
  2070. --------
  2071. Data can be provided as a structured ndarray:
  2072. >>> data = np.array([(3, 'a'), (2, 'b'), (1, 'c'), (0, 'd')],
  2073. ... dtype=[('col_1', 'i4'), ('col_2', 'U1')])
  2074. >>> pd.DataFrame.from_records(data)
  2075. col_1 col_2
  2076. 0 3 a
  2077. 1 2 b
  2078. 2 1 c
  2079. 3 0 d
  2080. Data can be provided as a list of dicts:
  2081. >>> data = [{'col_1': 3, 'col_2': 'a'},
  2082. ... {'col_1': 2, 'col_2': 'b'},
  2083. ... {'col_1': 1, 'col_2': 'c'},
  2084. ... {'col_1': 0, 'col_2': 'd'}]
  2085. >>> pd.DataFrame.from_records(data)
  2086. col_1 col_2
  2087. 0 3 a
  2088. 1 2 b
  2089. 2 1 c
  2090. 3 0 d
  2091. Data can be provided as a list of tuples with corresponding columns:
  2092. >>> data = [(3, 'a'), (2, 'b'), (1, 'c'), (0, 'd')]
  2093. >>> pd.DataFrame.from_records(data, columns=['col_1', 'col_2'])
  2094. col_1 col_2
  2095. 0 3 a
  2096. 1 2 b
  2097. 2 1 c
  2098. 3 0 d
  2099. """
  2100. if isinstance(data, DataFrame):
  2101. warnings.warn(
  2102. "Passing a DataFrame to DataFrame.from_records is deprecated. Use "
  2103. "set_index and/or drop to modify the DataFrame instead.",
  2104. FutureWarning,
  2105. stacklevel=find_stack_level(),
  2106. )
  2107. if columns is not None:
  2108. if is_scalar(columns):
  2109. columns = [columns]
  2110. data = data[columns]
  2111. if index is not None:
  2112. data = data.set_index(index)
  2113. if exclude is not None:
  2114. data = data.drop(columns=exclude)
  2115. return data.copy(deep=False)
  2116. result_index = None
  2117. # Make a copy of the input columns so we can modify it
  2118. if columns is not None:
  2119. columns = ensure_index(columns)
  2120. def maybe_reorder(
  2121. arrays: list[ArrayLike], arr_columns: Index, columns: Index, index
  2122. ) -> tuple[list[ArrayLike], Index, Index | None]:
  2123. """
  2124. If our desired 'columns' do not match the data's pre-existing 'arr_columns',
  2125. we re-order our arrays. This is like a pre-emptive (cheap) reindex.
  2126. """
  2127. if len(arrays):
  2128. length = len(arrays[0])
  2129. else:
  2130. length = 0
  2131. result_index = None
  2132. if len(arrays) == 0 and index is None and length == 0:
  2133. result_index = default_index(0)
  2134. arrays, arr_columns = reorder_arrays(arrays, arr_columns, columns, length)
  2135. return arrays, arr_columns, result_index
  2136. if is_iterator(data):
  2137. if nrows == 0:
  2138. return cls()
  2139. try:
  2140. first_row = next(data)
  2141. except StopIteration:
  2142. return cls(index=index, columns=columns)
  2143. dtype = None
  2144. if hasattr(first_row, "dtype") and first_row.dtype.names:
  2145. dtype = first_row.dtype
  2146. values = [first_row]
  2147. if nrows is None:
  2148. values += data
  2149. else:
  2150. values.extend(itertools.islice(data, nrows - 1))
  2151. if dtype is not None:
  2152. data = np.array(values, dtype=dtype)
  2153. else:
  2154. data = values
  2155. if isinstance(data, dict):
  2156. if columns is None:
  2157. columns = arr_columns = ensure_index(sorted(data))
  2158. arrays = [data[k] for k in columns]
  2159. else:
  2160. arrays = []
  2161. arr_columns_list = []
  2162. for k, v in data.items():
  2163. if k in columns:
  2164. arr_columns_list.append(k)
  2165. arrays.append(v)
  2166. arr_columns = Index(arr_columns_list)
  2167. arrays, arr_columns, result_index = maybe_reorder(
  2168. arrays, arr_columns, columns, index
  2169. )
  2170. elif isinstance(data, np.ndarray):
  2171. arrays, columns = to_arrays(data, columns)
  2172. arr_columns = columns
  2173. else:
  2174. arrays, arr_columns = to_arrays(data, columns)
  2175. if coerce_float:
  2176. for i, arr in enumerate(arrays):
  2177. if arr.dtype == object:
  2178. # error: Argument 1 to "maybe_convert_objects" has
  2179. # incompatible type "Union[ExtensionArray, ndarray]";
  2180. # expected "ndarray"
  2181. arrays[i] = lib.maybe_convert_objects(
  2182. arr, # type: ignore[arg-type]
  2183. try_float=True,
  2184. )
  2185. arr_columns = ensure_index(arr_columns)
  2186. if columns is None:
  2187. columns = arr_columns
  2188. else:
  2189. arrays, arr_columns, result_index = maybe_reorder(
  2190. arrays, arr_columns, columns, index
  2191. )
  2192. if exclude is None:
  2193. exclude = set()
  2194. else:
  2195. exclude = set(exclude)
  2196. if index is not None:
  2197. if isinstance(index, str) or not hasattr(index, "__iter__"):
  2198. i = columns.get_loc(index)
  2199. exclude.add(index)
  2200. if len(arrays) > 0:
  2201. result_index = Index(arrays[i], name=index)
  2202. else:
  2203. result_index = Index([], name=index)
  2204. else:
  2205. try:
  2206. index_data = [arrays[arr_columns.get_loc(field)] for field in index]
  2207. except (KeyError, TypeError):
  2208. # raised by get_loc, see GH#29258
  2209. result_index = index
  2210. else:
  2211. result_index = ensure_index_from_sequences(index_data, names=index)
  2212. exclude.update(index)
  2213. if any(exclude):
  2214. arr_exclude = [x for x in exclude if x in arr_columns]
  2215. to_remove = [arr_columns.get_loc(col) for col in arr_exclude]
  2216. arrays = [v for i, v in enumerate(arrays) if i not in to_remove]
  2217. columns = columns.drop(exclude)
  2218. manager = _get_option("mode.data_manager", silent=True)
  2219. mgr = arrays_to_mgr(arrays, columns, result_index, typ=manager)
  2220. df = DataFrame._from_mgr(mgr, axes=mgr.axes)
  2221. if cls is not DataFrame:
  2222. return cls(df, copy=False)
  2223. return df
  2224. def to_records(
  2225. self, index: bool = True, column_dtypes=None, index_dtypes=None
  2226. ) -> np.rec.recarray:
  2227. """
  2228. Convert DataFrame to a NumPy record array.
  2229. Index will be included as the first field of the record array if
  2230. requested.
  2231. Parameters
  2232. ----------
  2233. index : bool, default True
  2234. Include index in resulting record array, stored in 'index'
  2235. field or using the index label, if set.
  2236. column_dtypes : str, type, dict, default None
  2237. If a string or type, the data type to store all columns. If
  2238. a dictionary, a mapping of column names and indices (zero-indexed)
  2239. to specific data types.
  2240. index_dtypes : str, type, dict, default None
  2241. If a string or type, the data type to store all index levels. If
  2242. a dictionary, a mapping of index level names and indices
  2243. (zero-indexed) to specific data types.
  2244. This mapping is applied only if `index=True`.
  2245. Returns
  2246. -------
  2247. numpy.rec.recarray
  2248. NumPy ndarray with the DataFrame labels as fields and each row
  2249. of the DataFrame as entries.
  2250. See Also
  2251. --------
  2252. DataFrame.from_records: Convert structured or record ndarray
  2253. to DataFrame.
  2254. numpy.rec.recarray: An ndarray that allows field access using
  2255. attributes, analogous to typed columns in a
  2256. spreadsheet.
  2257. Examples
  2258. --------
  2259. >>> df = pd.DataFrame({'A': [1, 2], 'B': [0.5, 0.75]},
  2260. ... index=['a', 'b'])
  2261. >>> df
  2262. A B
  2263. a 1 0.50
  2264. b 2 0.75
  2265. >>> df.to_records()
  2266. rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
  2267. dtype=[('index', 'O'), ('A', '<i8'), ('B', '<f8')])
  2268. If the DataFrame index has no label then the recarray field name
  2269. is set to 'index'. If the index has a label then this is used as the
  2270. field name:
  2271. >>> df.index = df.index.rename("I")
  2272. >>> df.to_records()
  2273. rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
  2274. dtype=[('I', 'O'), ('A', '<i8'), ('B', '<f8')])
  2275. The index can be excluded from the record array:
  2276. >>> df.to_records(index=False)
  2277. rec.array([(1, 0.5 ), (2, 0.75)],
  2278. dtype=[('A', '<i8'), ('B', '<f8')])
  2279. Data types can be specified for the columns:
  2280. >>> df.to_records(column_dtypes={"A": "int32"})
  2281. rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
  2282. dtype=[('I', 'O'), ('A', '<i4'), ('B', '<f8')])
  2283. As well as for the index:
  2284. >>> df.to_records(index_dtypes="<S2")
  2285. rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)],
  2286. dtype=[('I', 'S2'), ('A', '<i8'), ('B', '<f8')])
  2287. >>> index_dtypes = f"<S{df.index.str.len().max()}"
  2288. >>> df.to_records(index_dtypes=index_dtypes)
  2289. rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)],
  2290. dtype=[('I', 'S1'), ('A', '<i8'), ('B', '<f8')])
  2291. """
  2292. if index:
  2293. ix_vals = [
  2294. np.asarray(self.index.get_level_values(i))
  2295. for i in range(self.index.nlevels)
  2296. ]
  2297. arrays = ix_vals + [
  2298. np.asarray(self.iloc[:, i]) for i in range(len(self.columns))
  2299. ]
  2300. index_names = list(self.index.names)
  2301. if isinstance(self.index, MultiIndex):
  2302. index_names = com.fill_missing_names(index_names)
  2303. elif index_names[0] is None:
  2304. index_names = ["index"]
  2305. names = [str(name) for name in itertools.chain(index_names, self.columns)]
  2306. else:
  2307. arrays = [np.asarray(self.iloc[:, i]) for i in range(len(self.columns))]
  2308. names = [str(c) for c in self.columns]
  2309. index_names = []
  2310. index_len = len(index_names)
  2311. formats = []
  2312. for i, v in enumerate(arrays):
  2313. index_int = i
  2314. # When the names and arrays are collected, we
  2315. # first collect those in the DataFrame's index,
  2316. # followed by those in its columns.
  2317. #
  2318. # Thus, the total length of the array is:
  2319. # len(index_names) + len(DataFrame.columns).
  2320. #
  2321. # This check allows us to see whether we are
  2322. # handling a name / array in the index or column.
  2323. if index_int < index_len:
  2324. dtype_mapping = index_dtypes
  2325. name = index_names[index_int]
  2326. else:
  2327. index_int -= index_len
  2328. dtype_mapping = column_dtypes
  2329. name = self.columns[index_int]
  2330. # We have a dictionary, so we get the data type
  2331. # associated with the index or column (which can
  2332. # be denoted by its name in the DataFrame or its
  2333. # position in DataFrame's array of indices or
  2334. # columns, whichever is applicable.
  2335. if is_dict_like(dtype_mapping):
  2336. if name in dtype_mapping:
  2337. dtype_mapping = dtype_mapping[name]
  2338. elif index_int in dtype_mapping:
  2339. dtype_mapping = dtype_mapping[index_int]
  2340. else:
  2341. dtype_mapping = None
  2342. # If no mapping can be found, use the array's
  2343. # dtype attribute for formatting.
  2344. #
  2345. # A valid dtype must either be a type or
  2346. # string naming a type.
  2347. if dtype_mapping is None:
  2348. formats.append(v.dtype)
  2349. elif isinstance(dtype_mapping, (type, np.dtype, str)):
  2350. # error: Argument 1 to "append" of "list" has incompatible
  2351. # type "Union[type, dtype[Any], str]"; expected "dtype[Any]"
  2352. formats.append(dtype_mapping) # type: ignore[arg-type]
  2353. else:
  2354. element = "row" if i < index_len else "column"
  2355. msg = f"Invalid dtype {dtype_mapping} specified for {element} {name}"
  2356. raise ValueError(msg)
  2357. return np.rec.fromarrays(arrays, dtype={"names": names, "formats": formats})
  2358. @classmethod
  2359. def _from_arrays(
  2360. cls,
  2361. arrays,
  2362. columns,
  2363. index,
  2364. dtype: Dtype | None = None,
  2365. verify_integrity: bool = True,
  2366. ) -> Self:
  2367. """
  2368. Create DataFrame from a list of arrays corresponding to the columns.
  2369. Parameters
  2370. ----------
  2371. arrays : list-like of arrays
  2372. Each array in the list corresponds to one column, in order.
  2373. columns : list-like, Index
  2374. The column names for the resulting DataFrame.
  2375. index : list-like, Index
  2376. The rows labels for the resulting DataFrame.
  2377. dtype : dtype, optional
  2378. Optional dtype to enforce for all arrays.
  2379. verify_integrity : bool, default True
  2380. Validate and homogenize all input. If set to False, it is assumed
  2381. that all elements of `arrays` are actual arrays how they will be
  2382. stored in a block (numpy ndarray or ExtensionArray), have the same
  2383. length as and are aligned with the index, and that `columns` and
  2384. `index` are ensured to be an Index object.
  2385. Returns
  2386. -------
  2387. DataFrame
  2388. """
  2389. if dtype is not None:
  2390. dtype = pandas_dtype(dtype)
  2391. manager = _get_option("mode.data_manager", silent=True)
  2392. columns = ensure_index(columns)
  2393. if len(columns) != len(arrays):
  2394. raise ValueError("len(columns) must match len(arrays)")
  2395. mgr = arrays_to_mgr(
  2396. arrays,
  2397. columns,
  2398. index,
  2399. dtype=dtype,
  2400. verify_integrity=verify_integrity,
  2401. typ=manager,
  2402. )
  2403. return cls._from_mgr(mgr, axes=mgr.axes)
  2404. @doc(
  2405. storage_options=_shared_docs["storage_options"],
  2406. compression_options=_shared_docs["compression_options"] % "path",
  2407. )
  2408. def to_stata(
  2409. self,
  2410. path: FilePath | WriteBuffer[bytes],
  2411. *,
  2412. convert_dates: dict[Hashable, str] | None = None,
  2413. write_index: bool = True,
  2414. byteorder: ToStataByteorder | None = None,
  2415. time_stamp: datetime.datetime | None = None,
  2416. data_label: str | None = None,
  2417. variable_labels: dict[Hashable, str] | None = None,
  2418. version: int | None = 114,
  2419. convert_strl: Sequence[Hashable] | None = None,
  2420. compression: CompressionOptions = "infer",
  2421. storage_options: StorageOptions | None = None,
  2422. value_labels: dict[Hashable, dict[float, str]] | None = None,
  2423. ) -> None:
  2424. """
  2425. Export DataFrame object to Stata dta format.
  2426. Writes the DataFrame to a Stata dataset file.
  2427. "dta" files contain a Stata dataset.
  2428. Parameters
  2429. ----------
  2430. path : str, path object, or buffer
  2431. String, path object (implementing ``os.PathLike[str]``), or file-like
  2432. object implementing a binary ``write()`` function.
  2433. convert_dates : dict
  2434. Dictionary mapping columns containing datetime types to stata
  2435. internal format to use when writing the dates. Options are 'tc',
  2436. 'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either an integer
  2437. or a name. Datetime columns that do not have a conversion type
  2438. specified will be converted to 'tc'. Raises NotImplementedError if
  2439. a datetime column has timezone information.
  2440. write_index : bool
  2441. Write the index to Stata dataset.
  2442. byteorder : str
  2443. Can be ">", "<", "little", or "big". default is `sys.byteorder`.
  2444. time_stamp : datetime
  2445. A datetime to use as file creation date. Default is the current
  2446. time.
  2447. data_label : str, optional
  2448. A label for the data set. Must be 80 characters or smaller.
  2449. variable_labels : dict
  2450. Dictionary containing columns as keys and variable labels as
  2451. values. Each label must be 80 characters or smaller.
  2452. version : {{114, 117, 118, 119, None}}, default 114
  2453. Version to use in the output dta file. Set to None to let pandas
  2454. decide between 118 or 119 formats depending on the number of
  2455. columns in the frame. Version 114 can be read by Stata 10 and
  2456. later. Version 117 can be read by Stata 13 or later. Version 118
  2457. is supported in Stata 14 and later. Version 119 is supported in
  2458. Stata 15 and later. Version 114 limits string variables to 244
  2459. characters or fewer while versions 117 and later allow strings
  2460. with lengths up to 2,000,000 characters. Versions 118 and 119
  2461. support Unicode characters, and version 119 supports more than
  2462. 32,767 variables.
  2463. Version 119 should usually only be used when the number of
  2464. variables exceeds the capacity of dta format 118. Exporting
  2465. smaller datasets in format 119 may have unintended consequences,
  2466. and, as of November 2020, Stata SE cannot read version 119 files.
  2467. convert_strl : list, optional
  2468. List of column names to convert to string columns to Stata StrL
  2469. format. Only available if version is 117. Storing strings in the
  2470. StrL format can produce smaller dta files if strings have more than
  2471. 8 characters and values are repeated.
  2472. {compression_options}
  2473. .. versionchanged:: 1.4.0 Zstandard support.
  2474. {storage_options}
  2475. value_labels : dict of dicts
  2476. Dictionary containing columns as keys and dictionaries of column value
  2477. to labels as values. Labels for a single variable must be 32,000
  2478. characters or smaller.
  2479. .. versionadded:: 1.4.0
  2480. Raises
  2481. ------
  2482. NotImplementedError
  2483. * If datetimes contain timezone information
  2484. * Column dtype is not representable in Stata
  2485. ValueError
  2486. * Columns listed in convert_dates are neither datetime64[ns]
  2487. or datetime.datetime
  2488. * Column listed in convert_dates is not in DataFrame
  2489. * Categorical label contains more than 32,000 characters
  2490. See Also
  2491. --------
  2492. read_stata : Import Stata data files.
  2493. io.stata.StataWriter : Low-level writer for Stata data files.
  2494. io.stata.StataWriter117 : Low-level writer for version 117 files.
  2495. Examples
  2496. --------
  2497. >>> df = pd.DataFrame({{'animal': ['falcon', 'parrot', 'falcon',
  2498. ... 'parrot'],
  2499. ... 'speed': [350, 18, 361, 15]}})
  2500. >>> df.to_stata('animals.dta') # doctest: +SKIP
  2501. """
  2502. if version not in (114, 117, 118, 119, None):
  2503. raise ValueError("Only formats 114, 117, 118 and 119 are supported.")
  2504. if version == 114:
  2505. if convert_strl is not None:
  2506. raise ValueError("strl is not supported in format 114")
  2507. from pandas.io.stata import StataWriter as statawriter
  2508. elif version == 117:
  2509. # Incompatible import of "statawriter" (imported name has type
  2510. # "Type[StataWriter117]", local name has type "Type[StataWriter]")
  2511. from pandas.io.stata import ( # type: ignore[assignment]
  2512. StataWriter117 as statawriter,
  2513. )
  2514. else: # versions 118 and 119
  2515. # Incompatible import of "statawriter" (imported name has type
  2516. # "Type[StataWriter117]", local name has type "Type[StataWriter]")
  2517. from pandas.io.stata import ( # type: ignore[assignment]
  2518. StataWriterUTF8 as statawriter,
  2519. )
  2520. kwargs: dict[str, Any] = {}
  2521. if version is None or version >= 117:
  2522. # strl conversion is only supported >= 117
  2523. kwargs["convert_strl"] = convert_strl
  2524. if version is None or version >= 118:
  2525. # Specifying the version is only supported for UTF8 (118 or 119)
  2526. kwargs["version"] = version
  2527. writer = statawriter(
  2528. path,
  2529. self,
  2530. convert_dates=convert_dates,
  2531. byteorder=byteorder,
  2532. time_stamp=time_stamp,
  2533. data_label=data_label,
  2534. write_index=write_index,
  2535. variable_labels=variable_labels,
  2536. compression=compression,
  2537. storage_options=storage_options,
  2538. value_labels=value_labels,
  2539. **kwargs,
  2540. )
  2541. writer.write_file()
  2542. def to_feather(self, path: FilePath | WriteBuffer[bytes], **kwargs) -> None:
  2543. """
  2544. Write a DataFrame to the binary Feather format.
  2545. Parameters
  2546. ----------
  2547. path : str, path object, file-like object
  2548. String, path object (implementing ``os.PathLike[str]``), or file-like
  2549. object implementing a binary ``write()`` function. If a string or a path,
  2550. it will be used as Root Directory path when writing a partitioned dataset.
  2551. **kwargs :
  2552. Additional keywords passed to :func:`pyarrow.feather.write_feather`.
  2553. This includes the `compression`, `compression_level`, `chunksize`
  2554. and `version` keywords.
  2555. Notes
  2556. -----
  2557. This function writes the dataframe as a `feather file
  2558. <https://arrow.apache.org/docs/python/feather.html>`_. Requires a default
  2559. index. For saving the DataFrame with your custom index use a method that
  2560. supports custom indices e.g. `to_parquet`.
  2561. Examples
  2562. --------
  2563. >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6]])
  2564. >>> df.to_feather("file.feather") # doctest: +SKIP
  2565. """
  2566. from pandas.io.feather_format import to_feather
  2567. to_feather(self, path, **kwargs)
  2568. @deprecate_nonkeyword_arguments(
  2569. version="3.0", allowed_args=["self", "buf"], name="to_markdown"
  2570. )
  2571. @doc(
  2572. Series.to_markdown,
  2573. klass=_shared_doc_kwargs["klass"],
  2574. storage_options=_shared_docs["storage_options"],
  2575. examples="""Examples
  2576. --------
  2577. >>> df = pd.DataFrame(
  2578. ... data={"animal_1": ["elk", "pig"], "animal_2": ["dog", "quetzal"]}
  2579. ... )
  2580. >>> print(df.to_markdown())
  2581. | | animal_1 | animal_2 |
  2582. |---:|:-----------|:-----------|
  2583. | 0 | elk | dog |
  2584. | 1 | pig | quetzal |
  2585. Output markdown with a tabulate option.
  2586. >>> print(df.to_markdown(tablefmt="grid"))
  2587. +----+------------+------------+
  2588. | | animal_1 | animal_2 |
  2589. +====+============+============+
  2590. | 0 | elk | dog |
  2591. +----+------------+------------+
  2592. | 1 | pig | quetzal |
  2593. +----+------------+------------+""",
  2594. )
  2595. def to_markdown(
  2596. self,
  2597. buf: FilePath | WriteBuffer[str] | None = None,
  2598. mode: str = "wt",
  2599. index: bool = True,
  2600. storage_options: StorageOptions | None = None,
  2601. **kwargs,
  2602. ) -> str | None:
  2603. if "showindex" in kwargs:
  2604. raise ValueError("Pass 'index' instead of 'showindex")
  2605. kwargs.setdefault("headers", "keys")
  2606. kwargs.setdefault("tablefmt", "pipe")
  2607. kwargs.setdefault("showindex", index)
  2608. tabulate = import_optional_dependency("tabulate")
  2609. result = tabulate.tabulate(self, **kwargs)
  2610. if buf is None:
  2611. return result
  2612. with get_handle(buf, mode, storage_options=storage_options) as handles:
  2613. handles.handle.write(result)
  2614. return None
    # Typing overload: ``path`` omitted or ``None`` -> the call is annotated
    # as returning ``bytes``.
    @overload
    def to_parquet(
        self,
        path: None = ...,
        engine: Literal["auto", "pyarrow", "fastparquet"] = ...,
        compression: str | None = ...,
        index: bool | None = ...,
        partition_cols: list[str] | None = ...,
        storage_options: StorageOptions = ...,
        **kwargs,
    ) -> bytes:
        ...

    # Typing overload: an explicit path or binary write buffer -> the call is
    # annotated as returning ``None``.
    @overload
    def to_parquet(
        self,
        path: FilePath | WriteBuffer[bytes],
        engine: Literal["auto", "pyarrow", "fastparquet"] = ...,
        compression: str | None = ...,
        index: bool | None = ...,
        partition_cols: list[str] | None = ...,
        storage_options: StorageOptions = ...,
        **kwargs,
    ) -> None:
        ...
  2639. @deprecate_nonkeyword_arguments(
  2640. version="3.0", allowed_args=["self", "path"], name="to_parquet"
  2641. )
  2642. @doc(storage_options=_shared_docs["storage_options"])
  2643. def to_parquet(
  2644. self,
  2645. path: FilePath | WriteBuffer[bytes] | None = None,
  2646. engine: Literal["auto", "pyarrow", "fastparquet"] = "auto",
  2647. compression: str | None = "snappy",
  2648. index: bool | None = None,
  2649. partition_cols: list[str] | None = None,
  2650. storage_options: StorageOptions | None = None,
  2651. **kwargs,
  2652. ) -> bytes | None:
  2653. """
  2654. Write a DataFrame to the binary parquet format.
  2655. This function writes the dataframe as a `parquet file
  2656. <https://parquet.apache.org/>`_. You can choose different parquet
  2657. backends, and have the option of compression. See
  2658. :ref:`the user guide <io.parquet>` for more details.
  2659. Parameters
  2660. ----------
  2661. path : str, path object, file-like object, or None, default None
  2662. String, path object (implementing ``os.PathLike[str]``), or file-like
  2663. object implementing a binary ``write()`` function. If None, the result is
  2664. returned as bytes. If a string or path, it will be used as Root Directory
  2665. path when writing a partitioned dataset.
  2666. engine : {{'auto', 'pyarrow', 'fastparquet'}}, default 'auto'
  2667. Parquet library to use. If 'auto', then the option
  2668. ``io.parquet.engine`` is used. The default ``io.parquet.engine``
  2669. behavior is to try 'pyarrow', falling back to 'fastparquet' if
  2670. 'pyarrow' is unavailable.
  2671. compression : str or None, default 'snappy'
  2672. Name of the compression to use. Use ``None`` for no compression.
  2673. Supported options: 'snappy', 'gzip', 'brotli', 'lz4', 'zstd'.
  2674. index : bool, default None
  2675. If ``True``, include the dataframe's index(es) in the file output.
  2676. If ``False``, they will not be written to the file.
  2677. If ``None``, similar to ``True`` the dataframe's index(es)
  2678. will be saved. However, instead of being saved as values,
  2679. the RangeIndex will be stored as a range in the metadata so it
  2680. doesn't require much space and is faster. Other indexes will
  2681. be included as columns in the file output.
  2682. partition_cols : list, optional, default None
  2683. Column names by which to partition the dataset.
  2684. Columns are partitioned in the order they are given.
  2685. Must be None if path is not a string.
  2686. {storage_options}
  2687. **kwargs
  2688. Additional arguments passed to the parquet library. See
  2689. :ref:`pandas io <io.parquet>` for more details.
  2690. Returns
  2691. -------
  2692. bytes if no path argument is provided else None
  2693. See Also
  2694. --------
  2695. read_parquet : Read a parquet file.
  2696. DataFrame.to_orc : Write an orc file.
  2697. DataFrame.to_csv : Write a csv file.
  2698. DataFrame.to_sql : Write to a sql table.
  2699. DataFrame.to_hdf : Write to hdf.
  2700. Notes
  2701. -----
  2702. This function requires either the `fastparquet
  2703. <https://pypi.org/project/fastparquet>`_ or `pyarrow
  2704. <https://arrow.apache.org/docs/python/>`_ library.
  2705. Examples
  2706. --------
  2707. >>> df = pd.DataFrame(data={{'col1': [1, 2], 'col2': [3, 4]}})
  2708. >>> df.to_parquet('df.parquet.gzip',
  2709. ... compression='gzip') # doctest: +SKIP
  2710. >>> pd.read_parquet('df.parquet.gzip') # doctest: +SKIP
  2711. col1 col2
  2712. 0 1 3
  2713. 1 2 4
  2714. If you want to get a buffer to the parquet content you can use a io.BytesIO
  2715. object, as long as you don't use partition_cols, which creates multiple files.
  2716. >>> import io
  2717. >>> f = io.BytesIO()
  2718. >>> df.to_parquet(f)
  2719. >>> f.seek(0)
  2720. 0
  2721. >>> content = f.read()
  2722. """
  2723. from pandas.io.parquet import to_parquet
  2724. return to_parquet(
  2725. self,
  2726. path,
  2727. engine,
  2728. compression=compression,
  2729. index=index,
  2730. partition_cols=partition_cols,
  2731. storage_options=storage_options,
  2732. **kwargs,
  2733. )
  2734. def to_orc(
  2735. self,
  2736. path: FilePath | WriteBuffer[bytes] | None = None,
  2737. *,
  2738. engine: Literal["pyarrow"] = "pyarrow",
  2739. index: bool | None = None,
  2740. engine_kwargs: dict[str, Any] | None = None,
  2741. ) -> bytes | None:
  2742. """
  2743. Write a DataFrame to the ORC format.
  2744. .. versionadded:: 1.5.0
  2745. Parameters
  2746. ----------
  2747. path : str, file-like object or None, default None
  2748. If a string, it will be used as Root Directory path
  2749. when writing a partitioned dataset. By file-like object,
  2750. we refer to objects with a write() method, such as a file handle
  2751. (e.g. via builtin open function). If path is None,
  2752. a bytes object is returned.
  2753. engine : {'pyarrow'}, default 'pyarrow'
  2754. ORC library to use.
  2755. index : bool, optional
  2756. If ``True``, include the dataframe's index(es) in the file output.
  2757. If ``False``, they will not be written to the file.
  2758. If ``None``, similar to ``infer`` the dataframe's index(es)
  2759. will be saved. However, instead of being saved as values,
  2760. the RangeIndex will be stored as a range in the metadata so it
  2761. doesn't require much space and is faster. Other indexes will
  2762. be included as columns in the file output.
  2763. engine_kwargs : dict[str, Any] or None, default None
  2764. Additional keyword arguments passed to :func:`pyarrow.orc.write_table`.
  2765. Returns
  2766. -------
  2767. bytes if no path argument is provided else None
  2768. Raises
  2769. ------
  2770. NotImplementedError
  2771. Dtype of one or more columns is category, unsigned integers, interval,
  2772. period or sparse.
  2773. ValueError
  2774. engine is not pyarrow.
  2775. See Also
  2776. --------
  2777. read_orc : Read a ORC file.
  2778. DataFrame.to_parquet : Write a parquet file.
  2779. DataFrame.to_csv : Write a csv file.
  2780. DataFrame.to_sql : Write to a sql table.
  2781. DataFrame.to_hdf : Write to hdf.
  2782. Notes
  2783. -----
  2784. * Before using this function you should read the :ref:`user guide about
  2785. ORC <io.orc>` and :ref:`install optional dependencies <install.warn_orc>`.
  2786. * This function requires `pyarrow <https://arrow.apache.org/docs/python/>`_
  2787. library.
  2788. * For supported dtypes please refer to `supported ORC features in Arrow
  2789. <https://arrow.apache.org/docs/cpp/orc.html#data-types>`__.
  2790. * Currently timezones in datetime columns are not preserved when a
  2791. dataframe is converted into ORC files.
  2792. Examples
  2793. --------
  2794. >>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [4, 3]})
  2795. >>> df.to_orc('df.orc') # doctest: +SKIP
  2796. >>> pd.read_orc('df.orc') # doctest: +SKIP
  2797. col1 col2
  2798. 0 1 4
  2799. 1 2 3
  2800. If you want to get a buffer to the orc content you can write it to io.BytesIO
  2801. >>> import io
  2802. >>> b = io.BytesIO(df.to_orc()) # doctest: +SKIP
  2803. >>> b.seek(0) # doctest: +SKIP
  2804. 0
  2805. >>> content = b.read() # doctest: +SKIP
  2806. """
  2807. from pandas.io.orc import to_orc
  2808. return to_orc(
  2809. self, path, engine=engine, index=index, engine_kwargs=engine_kwargs
  2810. )
  # Typing-only overload: ``buf`` is a required path or writable text buffer,
  # and the call is annotated as returning ``None``.
  @overload
  def to_html(
      self,
      buf: FilePath | WriteBuffer[str],
      columns: Axes | None = ...,
      col_space: ColspaceArgType | None = ...,
      header: bool = ...,
      index: bool = ...,
      na_rep: str = ...,
      formatters: FormattersType | None = ...,
      float_format: FloatFormatType | None = ...,
      sparsify: bool | None = ...,
      index_names: bool = ...,
      justify: str | None = ...,
      max_rows: int | None = ...,
      max_cols: int | None = ...,
      show_dimensions: bool | str = ...,
      decimal: str = ...,
      bold_rows: bool = ...,
      classes: str | list | tuple | None = ...,
      escape: bool = ...,
      notebook: bool = ...,
      border: int | bool | None = ...,
      table_id: str | None = ...,
      render_links: bool = ...,
      encoding: str | None = ...,
  ) -> None:
      ...
  # Typing-only overload: ``buf`` is omitted or ``None``, and the call is
  # annotated as returning the rendered HTML as ``str``.
  @overload
  def to_html(
      self,
      buf: None = ...,
      columns: Axes | None = ...,
      col_space: ColspaceArgType | None = ...,
      header: bool = ...,
      index: bool = ...,
      na_rep: str = ...,
      formatters: FormattersType | None = ...,
      float_format: FloatFormatType | None = ...,
      sparsify: bool | None = ...,
      index_names: bool = ...,
      justify: str | None = ...,
      max_rows: int | None = ...,
      max_cols: int | None = ...,
      show_dimensions: bool | str = ...,
      decimal: str = ...,
      bold_rows: bool = ...,
      classes: str | list | tuple | None = ...,
      escape: bool = ...,
      notebook: bool = ...,
      border: int | bool | None = ...,
      table_id: str | None = ...,
      render_links: bool = ...,
      encoding: str | None = ...,
  ) -> str:
      ...
  @deprecate_nonkeyword_arguments(
      version="3.0", allowed_args=["self", "buf"], name="to_html"
  )
  @Substitution(
      header_type="bool",
      header="Whether to print column labels, default True",
      col_space_type="str or int, list or dict of int or str",
      col_space="The minimum width of each column in CSS length "
      "units.  An int is assumed to be px units.",
  )
  @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring)
  def to_html(
      self,
      buf: FilePath | WriteBuffer[str] | None = None,
      columns: Axes | None = None,
      col_space: ColspaceArgType | None = None,
      header: bool = True,
      index: bool = True,
      na_rep: str = "NaN",
      formatters: FormattersType | None = None,
      float_format: FloatFormatType | None = None,
      sparsify: bool | None = None,
      index_names: bool = True,
      justify: str | None = None,
      max_rows: int | None = None,
      max_cols: int | None = None,
      show_dimensions: bool | str = False,
      decimal: str = ".",
      bold_rows: bool = True,
      classes: str | list | tuple | None = None,
      escape: bool = True,
      notebook: bool = False,
      border: int | bool | None = None,
      table_id: str | None = None,
      render_links: bool = False,
      encoding: str | None = None,
  ) -> str | None:
      """
      Render a DataFrame as an HTML table.
      %(shared_params)s
      bold_rows : bool, default True
          Make the row labels bold in the output.
      classes : str or list or tuple, default None
          CSS class(es) to apply to the resulting html table.
      escape : bool, default True
          Convert the characters <, >, and & to HTML-safe sequences.
      notebook : {True, False}, default False
          Whether the generated HTML is for IPython Notebook.
      border : int or bool, optional
          A ``border=border`` attribute is included in the opening
          `<table>` tag. Default ``pd.options.display.html.border``.
      table_id : str, optional
          A css id is included in the opening `<table>` tag if specified.
      render_links : bool, default False
          Convert URLs to HTML links.
      encoding : str, default "utf-8"
          Set character encoding.
      %(returns)s
      See Also
      --------
      to_string : Convert DataFrame to a string.

      Examples
      --------
      >>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [4, 3]})
      >>> html_string = '''<table border="1" class="dataframe">
      ...   <thead>
      ...     <tr style="text-align: right;">
      ...       <th></th>
      ...       <th>col1</th>
      ...       <th>col2</th>
      ...     </tr>
      ...   </thead>
      ...   <tbody>
      ...     <tr>
      ...       <th>0</th>
      ...       <td>1</td>
      ...       <td>4</td>
      ...     </tr>
      ...     <tr>
      ...       <th>1</th>
      ...       <td>2</td>
      ...       <td>3</td>
      ...     </tr>
      ...   </tbody>
      ... </table>'''
      >>> assert html_string == df.to_html()
      """
      # Reject an unknown ``justify`` before any formatter is built.
      if not (justify is None or justify in fmt.VALID_JUSTIFY_PARAMETERS):
          raise ValueError("Invalid value for justify parameter")

      # Options forwarded to the DataFrameFormatter.
      frame_formatter = fmt.DataFrameFormatter(
          self,
          columns=columns,
          col_space=col_space,
          header=header,
          index=index,
          index_names=index_names,
          na_rep=na_rep,
          formatters=formatters,
          float_format=float_format,
          decimal=decimal,
          sparsify=sparsify,
          justify=justify,
          bold_rows=bold_rows,
          escape=escape,
          max_rows=max_rows,
          max_cols=max_cols,
          show_dimensions=show_dimensions,
      )

      # TODO: a generic formatter would belong in DataFrameFormatter
      # Options forwarded to the renderer's ``to_html``.
      renderer = fmt.DataFrameRenderer(frame_formatter)
      return renderer.to_html(
          buf=buf,
          encoding=encoding,
          classes=classes,
          notebook=notebook,
          border=border,
          table_id=table_id,
          render_links=render_links,
      )
  # Typing-only overload: ``path_or_buffer`` is omitted or ``None``, and the
  # call is annotated as returning the XML document as ``str``.
  @overload
  def to_xml(
      self,
      path_or_buffer: None = ...,
      *,
      index: bool = ...,
      root_name: str | None = ...,
      row_name: str | None = ...,
      na_rep: str | None = ...,
      attr_cols: list[str] | None = ...,
      elem_cols: list[str] | None = ...,
      namespaces: dict[str | None, str] | None = ...,
      prefix: str | None = ...,
      encoding: str = ...,
      xml_declaration: bool | None = ...,
      pretty_print: bool | None = ...,
      parser: XMLParsers | None = ...,
      stylesheet: FilePath | ReadBuffer[str] | ReadBuffer[bytes] | None = ...,
      compression: CompressionOptions = ...,
      storage_options: StorageOptions | None = ...,
  ) -> str:
      ...
  # Typing-only overload: ``path_or_buffer`` is a required path or writable
  # buffer, and the call is annotated as returning ``None``.
  @overload
  def to_xml(
      self,
      path_or_buffer: FilePath | WriteBuffer[bytes] | WriteBuffer[str],
      *,
      index: bool = ...,
      root_name: str | None = ...,
      row_name: str | None = ...,
      na_rep: str | None = ...,
      attr_cols: list[str] | None = ...,
      elem_cols: list[str] | None = ...,
      namespaces: dict[str | None, str] | None = ...,
      prefix: str | None = ...,
      encoding: str = ...,
      xml_declaration: bool | None = ...,
      pretty_print: bool | None = ...,
      parser: XMLParsers | None = ...,
      stylesheet: FilePath | ReadBuffer[str] | ReadBuffer[bytes] | None = ...,
      compression: CompressionOptions = ...,
      storage_options: StorageOptions | None = ...,
  ) -> None:
      ...
  @deprecate_nonkeyword_arguments(
      version="3.0", allowed_args=["self", "path_or_buffer"], name="to_xml"
  )
  @doc(
      storage_options=_shared_docs["storage_options"],
      compression_options=_shared_docs["compression_options"] % "path_or_buffer",
  )
  def to_xml(
      self,
      path_or_buffer: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
      index: bool = True,
      root_name: str | None = "data",
      row_name: str | None = "row",
      na_rep: str | None = None,
      attr_cols: list[str] | None = None,
      elem_cols: list[str] | None = None,
      namespaces: dict[str | None, str] | None = None,
      prefix: str | None = None,
      encoding: str = "utf-8",
      xml_declaration: bool | None = True,
      pretty_print: bool | None = True,
      parser: XMLParsers | None = "lxml",
      stylesheet: FilePath | ReadBuffer[str] | ReadBuffer[bytes] | None = None,
      compression: CompressionOptions = "infer",
      storage_options: StorageOptions | None = None,
  ) -> str | None:
      """
      Render a DataFrame to an XML document.

      .. versionadded:: 1.3.0

      Parameters
      ----------
      path_or_buffer : str, path object, file-like object, or None, default None
          String, path object (implementing ``os.PathLike[str]``), or file-like
          object implementing a ``write()`` function. If None, the result is returned
          as a string.
      index : bool, default True
          Whether to include index in XML document.
      root_name : str, default 'data'
          The name of root element in XML document.
      row_name : str, default 'row'
          The name of row element in XML document.
      na_rep : str, optional
          Missing data representation.
      attr_cols : list-like, optional
          List of columns to write as attributes in row element.
          Hierarchical columns will be flattened with underscore
          delimiting the different levels.
      elem_cols : list-like, optional
          List of columns to write as children in row element. By default,
          all columns output as children of row element. Hierarchical
          columns will be flattened with underscore delimiting the
          different levels.
      namespaces : dict, optional
          All namespaces to be defined in root element. Keys of dict
          should be prefix names and values of dict corresponding URIs.
          Default namespaces should be given empty string key. For
          example, ::

              namespaces = {{"": "https://example.com"}}

      prefix : str, optional
          Namespace prefix to be used for every element and/or attribute
          in document. This should be one of the keys in ``namespaces``
          dict.
      encoding : str, default 'utf-8'
          Encoding of the resulting document.
      xml_declaration : bool, default True
          Whether to include the XML declaration at start of document.
      pretty_print : bool, default True
          Whether output should be pretty printed with indentation and
          line breaks.
      parser : {{'lxml','etree'}}, default 'lxml'
          Parser module to use for building of tree. Only 'lxml' and
          'etree' are supported. With 'lxml', the ability to use XSLT
          stylesheet is supported.
      stylesheet : str, path object or file-like object, optional
          A URL, file-like object, or a raw string containing an XSLT
          script used to transform the raw XML output. Script should use
          layout of elements and attributes from original output. This
          argument requires ``lxml`` to be installed. Only XSLT 1.0
          scripts and not later versions are currently supported.
      {compression_options}

          .. versionchanged:: 1.4.0 Zstandard support.

      {storage_options}

      Returns
      -------
      None or str
          If ``path_or_buffer`` is None, returns the resulting XML format as a
          string. Otherwise returns None.

      Raises
      ------
      ImportError
          If ``parser='lxml'`` is requested but lxml cannot be imported.
      ValueError
          If ``parser`` is neither ``'lxml'`` nor ``'etree'``.

      See Also
      --------
      to_json : Convert the pandas object to a JSON string.
      to_html : Convert DataFrame to a html.

      Examples
      --------
      >>> df = pd.DataFrame({{'shape': ['square', 'circle', 'triangle'],
      ...                    'degrees': [360, 360, 180],
      ...                    'sides': [4, np.nan, 3]}})

      >>> df.to_xml()  # doctest: +SKIP
      <?xml version='1.0' encoding='utf-8'?>
      <data>
        <row>
          <index>0</index>
          <shape>square</shape>
          <degrees>360</degrees>
          <sides>4.0</sides>
        </row>
        <row>
          <index>1</index>
          <shape>circle</shape>
          <degrees>360</degrees>
          <sides/>
        </row>
        <row>
          <index>2</index>
          <shape>triangle</shape>
          <degrees>180</degrees>
          <sides>3.0</sides>
        </row>
      </data>

      >>> df.to_xml(attr_cols=[
      ...           'index', 'shape', 'degrees', 'sides'
      ...           ])  # doctest: +SKIP
      <?xml version='1.0' encoding='utf-8'?>
      <data>
        <row index="0" shape="square" degrees="360" sides="4.0"/>
        <row index="1" shape="circle" degrees="360"/>
        <row index="2" shape="triangle" degrees="180" sides="3.0"/>
      </data>

      >>> df.to_xml(namespaces={{"doc": "https://example.com"}},
      ...           prefix="doc")  # doctest: +SKIP
      <?xml version='1.0' encoding='utf-8'?>
      <doc:data xmlns:doc="https://example.com">
        <doc:row>
          <doc:index>0</doc:index>
          <doc:shape>square</doc:shape>
          <doc:degrees>360</doc:degrees>
          <doc:sides>4.0</doc:sides>
        </doc:row>
        <doc:row>
          <doc:index>1</doc:index>
          <doc:shape>circle</doc:shape>
          <doc:degrees>360</doc:degrees>
          <doc:sides/>
        </doc:row>
        <doc:row>
          <doc:index>2</doc:index>
          <doc:shape>triangle</doc:shape>
          <doc:degrees>180</doc:degrees>
          <doc:sides>3.0</doc:sides>
        </doc:row>
      </doc:data>
      """

      from pandas.io.formats.xml import (
          EtreeXMLFormatter,
          LxmlXMLFormatter,
      )

      TreeBuilder: type[EtreeXMLFormatter | LxmlXMLFormatter]

      if parser == "lxml":
          # Probe for the optional dependency only when it was actually
          # requested, so the etree path never touches lxml.
          lxml = import_optional_dependency("lxml.etree", errors="ignore")
          if lxml is None:
              raise ImportError(
                  "lxml not found, please install or use the etree parser."
              )
          TreeBuilder = LxmlXMLFormatter
      elif parser == "etree":
          TreeBuilder = EtreeXMLFormatter
      else:
          raise ValueError("Values for parser can only be lxml or etree.")

      # ``parser`` only selects the formatter class; every other option is
      # forwarded to that formatter unchanged.
      xml_formatter = TreeBuilder(
          self,
          path_or_buffer=path_or_buffer,
          index=index,
          root_name=root_name,
          row_name=row_name,
          na_rep=na_rep,
          attr_cols=attr_cols,
          elem_cols=elem_cols,
          namespaces=namespaces,
          prefix=prefix,
          encoding=encoding,
          xml_declaration=xml_declaration,
          pretty_print=pretty_print,
          stylesheet=stylesheet,
          compression=compression,
          storage_options=storage_options,
      )

      return xml_formatter.write_output()
  3216. # ----------------------------------------------------------------------
  @doc(INFO_DOCSTRING, **frame_sub_kwargs)
  def info(
      self,
      verbose: bool | None = None,
      buf: WriteBuffer[str] | None = None,
      max_cols: int | None = None,
      memory_usage: bool | str | None = None,
      show_counts: bool | None = None,
  ) -> None:
      # The ``@doc`` decorator above is given INFO_DOCSTRING, so no literal
      # docstring is written here.
      # ``memory_usage`` goes to the DataFrameInfo constructor; the remaining
      # options go to ``render``.
      frame_info = DataFrameInfo(data=self, memory_usage=memory_usage)
      frame_info.render(
          buf=buf,
          max_cols=max_cols,
          verbose=verbose,
          show_counts=show_counts,
      )
  def memory_usage(self, index: bool = True, deep: bool = False) -> Series:
      """
      Return the memory usage of each column in bytes.

      The memory usage can optionally include the contribution of
      the index and elements of `object` dtype.

      This value is displayed in `DataFrame.info` by default. This can be
      suppressed by setting ``pandas.options.display.memory_usage`` to False.

      Parameters
      ----------
      index : bool, default True
          Specifies whether to include the memory usage of the DataFrame's
          index in returned Series. If ``index=True``, the memory usage of
          the index is the first item in the output.
      deep : bool, default False
          If True, introspect the data deeply by interrogating
          `object` dtypes for system-level memory consumption, and include
          it in the returned values.

      Returns
      -------
      Series
          A Series whose index is the original column names and whose values
          is the memory usage of each column in bytes.

      See Also
      --------
      numpy.ndarray.nbytes : Total bytes consumed by the elements of an
          ndarray.
      Series.memory_usage : Bytes consumed by a Series.
      Categorical : Memory-efficient array for string values with
          many repeated values.
      DataFrame.info : Concise summary of a DataFrame.

      Notes
      -----
      See the :ref:`Frequently Asked Questions <df-memory-usage>` for more
      details.

      Examples
      --------
      >>> dtypes = ['int64', 'float64', 'complex128', 'object', 'bool']
      >>> data = dict([(t, np.ones(shape=5000, dtype=int).astype(t))
      ...              for t in dtypes])
      >>> df = pd.DataFrame(data)
      >>> df.head()
         int64  float64            complex128  object  bool
      0      1      1.0              1.0+0.0j       1  True
      1      1      1.0              1.0+0.0j       1  True
      2      1      1.0              1.0+0.0j       1  True
      3      1      1.0              1.0+0.0j       1  True
      4      1      1.0              1.0+0.0j       1  True

      >>> df.memory_usage()
      Index           128
      int64         40000
      float64       40000
      complex128    80000
      object        40000
      bool           5000
      dtype: int64

      >>> df.memory_usage(index=False)
      int64         40000
      float64       40000
      complex128    80000
      object        40000
      bool           5000
      dtype: int64

      The memory footprint of `object` dtype columns is ignored by default:

      >>> df.memory_usage(deep=True)
      Index            128
      int64          40000
      float64        40000
      complex128     80000
      object        180000
      bool            5000
      dtype: int64

      Use a Categorical for efficient storage of an object-dtype column with
      many repeated values.

      >>> df['object'].astype('category').memory_usage(deep=True)
      5244
      """
      # One byte count per column; each column's own index is excluded here.
      per_column_bytes = [
          column.memory_usage(index=False, deep=deep) for _, column in self.items()
      ]
      column_usage = self._constructor_sliced(
          per_column_bytes,
          index=self.columns,
          dtype=np.intp,
      )
      if not index:
          return column_usage

      # Prepend the frame index's footprint under the label "Index".
      index_usage = self._constructor_sliced(
          self.index.memory_usage(deep=deep), index=["Index"]
      )
      return index_usage._append(column_usage)
  def transpose(self, *args, copy: bool = False) -> DataFrame:
      """
      Transpose index and columns.

      Reflect the DataFrame over its main diagonal by writing rows as columns
      and vice-versa. The property :attr:`.T` is an accessor to the method
      :meth:`transpose`.

      Parameters
      ----------
      *args : tuple, optional
          Accepted for compatibility with NumPy.
      copy : bool, default False
          Whether to copy the data after transposing, even for DataFrames
          with a single dtype.

          Note that a copy is always required for mixed dtype DataFrames,
          or for DataFrames with any extension types.

          .. note::
              The `copy` keyword will change behavior in pandas 3.0.
              `Copy-on-Write
              <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
              will be enabled by default, which means that all methods with a
              `copy` keyword will use a lazy copy mechanism to defer the copy and
              ignore the `copy` keyword. The `copy` keyword will be removed in a
              future version of pandas.

              You can already get the future behavior and improvements through
              enabling copy on write ``pd.options.mode.copy_on_write = True``

      Returns
      -------
      DataFrame
          The transposed DataFrame.

      See Also
      --------
      numpy.transpose : Permute the dimensions of a given array.

      Notes
      -----
      Transposing a DataFrame with mixed dtypes will result in a homogeneous
      DataFrame with the `object` dtype. In such a case, a copy of the data
      is always made.

      Examples
      --------
      **Square DataFrame with homogeneous dtype**

      >>> d1 = {'col1': [1, 2], 'col2': [3, 4]}
      >>> df1 = pd.DataFrame(data=d1)
      >>> df1
         col1  col2
      0     1     3
      1     2     4

      >>> df1_transposed = df1.T  # or df1.transpose()
      >>> df1_transposed
            0  1
      col1  1  2
      col2  3  4

      When the dtype is homogeneous in the original DataFrame, we get a
      transposed DataFrame with the same dtype:

      >>> df1.dtypes
      col1    int64
      col2    int64
      dtype: object
      >>> df1_transposed.dtypes
      0    int64
      1    int64
      dtype: object

      **Non-square DataFrame with mixed dtypes**

      >>> d2 = {'name': ['Alice', 'Bob'],
      ...       'score': [9.5, 8],
      ...       'employed': [False, True],
      ...       'kids': [0, 0]}
      >>> df2 = pd.DataFrame(data=d2)
      >>> df2
          name  score  employed  kids
      0  Alice    9.5     False     0
      1    Bob    8.0      True     0

      >>> df2_transposed = df2.T  # or df2.transpose()
      >>> df2_transposed
                    0     1
      name      Alice   Bob
      score       9.5   8.0
      employed  False  True
      kids          0     0

      When the DataFrame has mixed dtypes, we get a transposed DataFrame with
      the `object` dtype:

      >>> df2.dtypes
      name         object
      score       float64
      employed       bool
      kids          int64
      dtype: object
      >>> df2_transposed.dtypes
      0    object
      1    object
      dtype: object
      """
      # ``*args`` exists only for NumPy compatibility; validate them.
      nv.validate_transpose(args, {})

      # Column dtypes, used below to choose the extension-array branch.
      dtypes = list(self.dtypes)

      if self._can_fast_transpose:
          # Branch 1: transpose ``self._values`` directly.
          # Note: tests pass without this, but this improves perf quite a bit.
          new_vals = self._values.T
          # Copy eagerly only when requested and Copy-on-Write is off.
          if copy and not using_copy_on_write():
              new_vals = new_vals.copy()

          result = self._constructor(
              new_vals,
              index=self.columns,
              columns=self.index,
              copy=False,
              dtype=new_vals.dtype,
          )
          # Under Copy-on-Write, register the source manager on the result;
          # presumably because ``new_vals`` may still share memory with self.
          if using_copy_on_write() and len(self) > 0:
              result._mgr.add_references(self._mgr)  # type: ignore[arg-type]

      elif (
          self._is_homogeneous_type
          and dtypes
          and isinstance(dtypes[0], ExtensionDtype)
      ):
          # Branch 2: every column shares one extension dtype; build one new
          # array per original row so that dtype is kept.
          new_values: list
          if isinstance(dtypes[0], BaseMaskedDtype):
              # We have masked arrays with the same dtype. We can transpose faster.
              from pandas.core.arrays.masked import (
                  transpose_homogeneous_masked_arrays,
              )

              new_values = transpose_homogeneous_masked_arrays(
                  cast(Sequence[BaseMaskedArray], self._iter_column_arrays())
              )
          elif isinstance(dtypes[0], ArrowDtype):
              # We have arrow EAs with the same dtype. We can transpose faster.
              from pandas.core.arrays.arrow.array import (
                  ArrowExtensionArray,
                  transpose_homogeneous_pyarrow,
              )

              new_values = transpose_homogeneous_pyarrow(
                  cast(Sequence[ArrowExtensionArray], self._iter_column_arrays())
              )
          else:
              # We have other EAs with the same dtype. We preserve dtype in transpose.
              dtyp = dtypes[0]
              arr_typ = dtyp.construct_array_type()
              values = self.values
              # Each row of ``values`` becomes one column of the result.
              new_values = [arr_typ._from_sequence(row, dtype=dtyp) for row in values]

          result = type(self)._from_arrays(
              new_values,
              index=self.columns,
              columns=self.index,
              verify_integrity=False,
          )

      else:
          # Branch 3: everything else goes through ``self.values``.
          new_arr = self.values.T
          if copy and not using_copy_on_write():
              new_arr = new_arr.copy()
          result = self._constructor(
              new_arr,
              index=self.columns,
              columns=self.index,
              dtype=new_arr.dtype,
              # We already made a copy (more than one block)
              copy=False,
          )

      return result.__finalize__(self, method="transpose")
  @property
  def T(self) -> DataFrame:
      """
      The transpose of the DataFrame.

      Attribute-style shorthand for calling :meth:`transpose` with no
      arguments.

      Returns
      -------
      DataFrame
          The transposed DataFrame.

      See Also
      --------
      DataFrame.transpose : Transpose index and columns.

      Examples
      --------
      >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
      >>> df
         col1  col2
      0     1     3
      1     2     4

      >>> df.T
            0  1
      col1  1  2
      col2  3  4
      """
      transposed = self.transpose()
      return transposed
  3503. # ----------------------------------------------------------------------
  3504. # Indexing Methods
  3505. def _ixs(self, i: int, axis: AxisInt = 0) -> Series:
  3506. """
  3507. Parameters
  3508. ----------
  3509. i : int
  3510. axis : int
  3511. Returns
  3512. -------
  3513. Series
  3514. """
  3515. # irow
  3516. if axis == 0:
  3517. new_mgr = self._mgr.fast_xs(i)
  3518. # if we are a copy, mark as such
  3519. copy = isinstance(new_mgr.array, np.ndarray) and new_mgr.array.base is None
  3520. result = self._constructor_sliced_from_mgr(new_mgr, axes=new_mgr.axes)
  3521. result._name = self.index[i]
  3522. result = result.__finalize__(self)
  3523. result._set_is_copy(self, copy=copy)
  3524. return result
  3525. # icol
  3526. else:
  3527. label = self.columns[i]
  3528. col_mgr = self._mgr.iget(i)
  3529. result = self._box_col_values(col_mgr, i)
  3530. # this is a cached value, mark it so
  3531. result._set_as_cached(label, self)
  3532. return result
  3533. def _get_column_array(self, i: int) -> ArrayLike:
  3534. """
  3535. Get the values of the i'th column (ndarray or ExtensionArray, as stored
  3536. in the Block)
  3537. Warning! The returned array is a view but doesn't handle Copy-on-Write,
  3538. so this should be used with caution (for read-only purposes).
  3539. """
  3540. return self._mgr.iget_values(i)
  def _iter_column_arrays(self) -> Iterator[ArrayLike]:
      """
      Yield the backing array of every column, in column order.

      Each item is the value as stored in the Block (ndarray or
      ExtensionArray).

      Warning! The returned array is a view but doesn't handle Copy-on-Write,
      so this should be used with caution (for read-only purposes).
      """
      manager = self._mgr
      if isinstance(manager, ArrayManager):
          # An ArrayManager exposes its arrays directly via ``.arrays``.
          yield from manager.arrays
          return

      n_columns = len(self.columns)
      for position in range(n_columns):
          yield self._get_column_array(position)
  3553. def _getitem_nocopy(self, key: list):
  3554. """
  3555. Behaves like __getitem__, but returns a view in cases where __getitem__
  3556. would make a copy.
  3557. """
  3558. # TODO(CoW): can be removed if/when we are always Copy-on-Write
  3559. indexer = self.columns._get_indexer_strict(key, "columns")[1]
  3560. new_axis = self.columns[indexer]
  3561. new_mgr = self._mgr.reindex_indexer(
  3562. new_axis,
  3563. indexer,
  3564. axis=0,
  3565. allow_dups=True,
  3566. copy=False,
  3567. only_slice=True,
  3568. )
  3569. result = self._constructor_from_mgr(new_mgr, axes=new_mgr.axes)
  3570. result = result.__finalize__(self)
  3571. return result
  def __getitem__(self, key):
      """
      Implement ``df[key]``.

      After normalizing ``key`` (zero-dim items are unwrapped, callables are
      applied to ``self``), the dispatch order is:

      1. hashable, non-iterator key found in the columns -> cached column
         (flat columns) or ``_getitem_multilevel`` (unique MultiIndex);
      2. ``slice`` -> ``_getitem_slice``;
      3. ``DataFrame`` -> ``self.where(key)``;
      4. boolean indexer -> ``_getitem_bool_array``;
      5. anything else -> looked up in the columns as a single key or as a
         collection of keys.
      """
      check_dict_or_set_indexers(key)
      key = lib.item_from_zerodim(key)
      key = com.apply_if_callable(key, self)

      if is_hashable(key) and not is_iterator(key):
          # is_iterator to exclude generator e.g. test_getitem_listlike
          # shortcut if the key is in columns
          is_mi = isinstance(self.columns, MultiIndex)
          # GH#45316 Return view if key is not duplicated
          # Only use drop_duplicates with duplicates for performance
          # (``and`` binds tighter than ``or``: the drop_duplicates check is
          # only evaluated when the unique-and-present check is falsy.)
          if not is_mi and (
              self.columns.is_unique
              and key in self.columns
              or key in self.columns.drop_duplicates(keep=False)
          ):
              return self._get_item_cache(key)

          elif is_mi and self.columns.is_unique and key in self.columns:
              return self._getitem_multilevel(key)

      # Do we have a slicer (on rows)?
      if isinstance(key, slice):
          return self._getitem_slice(key)

      # Do we have a (boolean) DataFrame?
      if isinstance(key, DataFrame):
          return self.where(key)

      # Do we have a (boolean) 1d indexer?
      if com.is_bool_indexer(key):
          return self._getitem_bool_array(key)

      # We are left with two options: a single key, and a collection of keys,
      # We interpret tuples as collections only for non-MultiIndex
      is_single_key = isinstance(key, tuple) or not is_list_like(key)

      if is_single_key:
          if self.columns.nlevels > 1:
              return self._getitem_multilevel(key)
          indexer = self.columns.get_loc(key)
          # Wrap an integer location in a list before it is passed to take.
          if is_integer(indexer):
              indexer = [indexer]
      else:
          # Materialize iterators so the keys can be consumed more than once.
          if is_iterator(key):
              key = list(key)
          indexer = self.columns._get_indexer_strict(key, "columns")[1]

      # take() does not accept boolean indexers
      if getattr(indexer, "dtype", None) == bool:
          indexer = np.where(indexer)[0]

      if isinstance(indexer, slice):
          return self._slice(indexer, axis=1)

      data = self._take_with_is_copy(indexer, axis=1)

      if is_single_key:
          # What does looking for a single key in a non-unique index return?
          # The behavior is inconsistent. It returns a Series, except when
          # - the key itself is repeated (test on data.shape, #9519), or
          # - we have a MultiIndex on columns (test on self.columns, #21309)
          if data.shape[1] == 1 and not isinstance(self.columns, MultiIndex):
              # GH#26490 using data[key] can cause RecursionError
              return data._get_item_cache(key)

      return data
  def _getitem_bool_array(self, key):
      """
      Select rows using a boolean 1d indexer.

      A boolean Series whose index differs from ``self.index`` triggers a
      ``UserWarning``; any other key whose length differs from the number of
      rows raises ``ValueError``.
      """
      # also raises Exception if object array with NA values
      # warning here just in case -- previously __setitem__ was
      # reindexing but __getitem__ was not; it seems more reasonable to
      # go with the __setitem__ behavior since that is more consistent
      # with all other indexing behavior
      misaligned_series = isinstance(key, Series) and not key.index.equals(
          self.index
      )
      if misaligned_series:
          warnings.warn(
              "Boolean Series key will be reindexed to match DataFrame index.",
              UserWarning,
              stacklevel=find_stack_level(),
          )
      elif len(key) != len(self.index):
          raise ValueError(
              f"Item wrong length {len(key)} instead of {len(self.index)}."
          )

      # check_bool_indexer will throw exception if Series key cannot
      # be reindexed to match DataFrame rows
      mask = check_bool_indexer(self.index, key)

      # All rows selected: return a copy of the whole frame.
      if mask.all():
          return self.copy(deep=None)

      row_positions = mask.nonzero()[0]
      return self._take_with_is_copy(row_positions, axis=0)
  def _getitem_multilevel(self, key):
      """
      Look up ``key`` in MultiIndex columns.

      A location that is a slice or ndarray gives a sub-frame whose columns
      have passed through ``maybe_droplevels``; any other location is
      treated as a single integer column position.
      """
      # self.columns is a MultiIndex
      loc = self.columns.get_loc(key)
      if not isinstance(loc, (slice, np.ndarray)):
          # loc is neither a slice nor ndarray, so must be an int
          return self._ixs(loc, axis=1)

      matched_columns = self.columns[loc]
      trimmed_columns = maybe_droplevels(matched_columns, key)
      result = self.iloc[:, loc]
      result.columns = trimmed_columns

      # If there is only one column being returned, and its name is
      # either an empty string, or a tuple with an empty string as its
      # first element, then treat the empty string as a placeholder
      # and return the column as if the user had provided that empty
      # string in the key. If the result is a Series, exclude the
      # implied empty string from its name.
      if len(result.columns) == 1:
          # e.g. test_frame_getitem_multicolumn_empty_level,
          #  test_frame_mixed_depth_get, test_loc_setitem_single_column_slice
          only_label = result.columns[0]
          leading = only_label[0] if isinstance(only_label, tuple) else only_label
          if leading == "":
              result = result[""]
              if isinstance(result, Series):
                  result = self._constructor_sliced(
                      result, index=self.index, name=key
                  )

      result._set_is_copy(self)
      return result
  3681. def _get_value(self, index, col, takeable: bool = False) -> Scalar:
  3682. """
  3683. Quickly retrieve single value at passed column and index.
  3684. Parameters
  3685. ----------
  3686. index : row label
  3687. col : column label
  3688. takeable : interpret the index/col as indexers, default False
  3689. Returns
  3690. -------
  3691. scalar
  3692. Notes
  3693. -----
  3694. Assumes that both `self.index._index_as_unique` and
  3695. `self.columns._index_as_unique`; Caller is responsible for checking.
  3696. """
  3697. if takeable:
  3698. series = self._ixs(col, axis=1)
  3699. return series._values[index]
  3700. series = self._get_item_cache(col)
  3701. engine = self.index._engine
  3702. if not isinstance(self.index, MultiIndex):
  3703. # CategoricalIndex: Trying to use the engine fastpath may give incorrect
  3704. # results if our categories are integers that dont match our codes
  3705. # IntervalIndex: IntervalTree has no get_loc
  3706. row = self.index.get_loc(index)
  3707. return series._values[row]
  3708. # For MultiIndex going through engine effectively restricts us to
  3709. # same-length tuples; see test_get_set_value_no_partial_indexing
  3710. loc = engine.get_loc(index)
  3711. return series._values[loc]
  3712. def isetitem(self, loc, value) -> None:
  3713. """
  3714. Set the given value in the column with position `loc`.
  3715. This is a positional analogue to ``__setitem__``.
  3716. Parameters
  3717. ----------
  3718. loc : int or sequence of ints
  3719. Index position for the column.
  3720. value : scalar or arraylike
  3721. Value(s) for the column.
  3722. Notes
  3723. -----
  3724. ``frame.isetitem(loc, value)`` is an in-place method as it will
  3725. modify the DataFrame in place (not returning a new object). In contrast to
  3726. ``frame.iloc[:, i] = value`` which will try to update the existing values in
  3727. place, ``frame.isetitem(loc, value)`` will not update the values of the column
  3728. itself in place, it will instead insert a new array.
  3729. In cases where ``frame.columns`` is unique, this is equivalent to
  3730. ``frame[frame.columns[i]] = value``.
  3731. """
  3732. if isinstance(value, DataFrame):
  3733. if is_integer(loc):
  3734. loc = [loc]
  3735. if len(loc) != len(value.columns):
  3736. raise ValueError(
  3737. f"Got {len(loc)} positions but value has {len(value.columns)} "
  3738. f"columns."
  3739. )
  3740. for i, idx in enumerate(loc):
  3741. arraylike, refs = self._sanitize_column(value.iloc[:, i])
  3742. self._iset_item_mgr(idx, arraylike, inplace=False, refs=refs)
  3743. return
  3744. arraylike, refs = self._sanitize_column(value)
  3745. self._iset_item_mgr(loc, arraylike, inplace=False, refs=refs)
def __setitem__(self, key, value) -> None:
    """
    Assign ``value`` to the part of the frame selected by ``key``.

    After the chained-assignment warning checks, a callable ``key`` is
    resolved against ``self`` and the assignment is dispatched on the types
    of ``key`` and ``value``: a slice goes to ``_setitem_slice``, a
    DataFrame / 2-dimensional key to ``_setitem_frame``, a Series, ndarray,
    list or Index key to ``_setitem_array``, a DataFrame value to
    ``_set_item_frame_value``, and everything else to ``_set_item``.
    """
    # Chained-assignment detection. Both branches are skipped on PyPy and
    # when WARNING_CHECK_DISABLED is set; they rely on the reference count
    # of ``self`` being at most 3, so nothing in this method may create an
    # extra reference to ``self`` before the check.
    if not PYPY and not WARNING_CHECK_DISABLED and using_copy_on_write():
        if sys.getrefcount(self) <= 3:
            warnings.warn(
                _chained_assignment_msg, ChainedAssignmentError, stacklevel=2
            )
    elif not PYPY and not WARNING_CHECK_DISABLED and not using_copy_on_write():
        # Without copy-on-write: warn when the reference count is low and
        # either the copy-on-write warning mode is on or at least one block
        # of the manager reports that it holds a reference.
        if sys.getrefcount(self) <= 3 and (
            warn_copy_on_write()
            or (
                not warn_copy_on_write()
                and any(b.refs.has_reference() for b in self._mgr.blocks)  # type: ignore[union-attr]
            )
        ):
            warnings.warn(
                _chained_assignment_warning_msg, FutureWarning, stacklevel=2
            )

    key = com.apply_if_callable(key, self)

    # see if we can slice the rows
    if isinstance(key, slice):
        slc = self.index._convert_slice_indexer(key, kind="getitem")
        return self._setitem_slice(slc, value)

    if isinstance(key, DataFrame) or getattr(key, "ndim", None) == 2:
        self._setitem_frame(key, value)
    elif isinstance(key, (Series, np.ndarray, list, Index)):
        self._setitem_array(key, value)
    elif isinstance(value, DataFrame):
        self._set_item_frame_value(key, value)
    elif (
        is_list_like(value)
        and not self.columns.is_unique
        # chained comparison: more than one column matches ``key`` AND the
        # number of matches equals the number of values supplied
        and 1 < len(self.columns.get_indexer_for([key])) == len(value)
    ):
        # Column to set is duplicated
        self._setitem_array([key], value)
    else:
        # set column
        self._set_item(key, value)
  3784. def _setitem_slice(self, key: slice, value) -> None:
  3785. # NB: we can't just use self.loc[key] = value because that
  3786. # operates on labels and we need to operate positional for
  3787. # backwards-compat, xref GH#31469
  3788. self._check_setitem_copy()
  3789. self.iloc[key] = value
  3790. def _setitem_array(self, key, value):
  3791. # also raises Exception if object array with NA values
  3792. if com.is_bool_indexer(key):
  3793. # bool indexer is indexing along rows
  3794. if len(key) != len(self.index):
  3795. raise ValueError(
  3796. f"Item wrong length {len(key)} instead of {len(self.index)}!"
  3797. )
  3798. key = check_bool_indexer(self.index, key)
  3799. indexer = key.nonzero()[0]
  3800. self._check_setitem_copy()
  3801. if isinstance(value, DataFrame):
  3802. # GH#39931 reindex since iloc does not align
  3803. value = value.reindex(self.index.take(indexer))
  3804. self.iloc[indexer] = value
  3805. else:
  3806. # Note: unlike self.iloc[:, indexer] = value, this will
  3807. # never try to overwrite values inplace
  3808. if isinstance(value, DataFrame):
  3809. check_key_length(self.columns, key, value)
  3810. for k1, k2 in zip(key, value.columns):
  3811. self[k1] = value[k2]
  3812. elif not is_list_like(value):
  3813. for col in key:
  3814. self[col] = value
  3815. elif isinstance(value, np.ndarray) and value.ndim == 2:
  3816. self._iset_not_inplace(key, value)
  3817. elif np.ndim(value) > 1:
  3818. # list of lists
  3819. value = DataFrame(value).values
  3820. return self._setitem_array(key, value)
  3821. else:
  3822. self._iset_not_inplace(key, value)
  3823. def _iset_not_inplace(self, key, value):
  3824. # GH#39510 when setting with df[key] = obj with a list-like key and
  3825. # list-like value, we iterate over those listlikes and set columns
  3826. # one at a time. This is different from dispatching to
  3827. # `self.loc[:, key]= value` because loc.__setitem__ may overwrite
  3828. # data inplace, whereas this will insert new arrays.
  3829. def igetitem(obj, i: int):
  3830. # Note: we catch DataFrame obj before getting here, but
  3831. # hypothetically would return obj.iloc[:, i]
  3832. if isinstance(obj, np.ndarray):
  3833. return obj[..., i]
  3834. else:
  3835. return obj[i]
  3836. if self.columns.is_unique:
  3837. if np.shape(value)[-1] != len(key):
  3838. raise ValueError("Columns must be same length as key")
  3839. for i, col in enumerate(key):
  3840. self[col] = igetitem(value, i)
  3841. else:
  3842. ilocs = self.columns.get_indexer_non_unique(key)[0]
  3843. if (ilocs < 0).any():
  3844. # key entries not in self.columns
  3845. raise NotImplementedError
  3846. if np.shape(value)[-1] != len(ilocs):
  3847. raise ValueError("Columns must be same length as key")
  3848. assert np.ndim(value) <= 2
  3849. orig_columns = self.columns
  3850. # Using self.iloc[:, i] = ... may set values inplace, which
  3851. # by convention we do not do in __setitem__
  3852. try:
  3853. self.columns = Index(range(len(self.columns)))
  3854. for i, iloc in enumerate(ilocs):
  3855. self[iloc] = igetitem(value, i)
  3856. finally:
  3857. self.columns = orig_columns
  3858. def _setitem_frame(self, key, value):
  3859. # support boolean setting with DataFrame input, e.g.
  3860. # df[df > df2] = 0
  3861. if isinstance(key, np.ndarray):
  3862. if key.shape != self.shape:
  3863. raise ValueError("Array conditional must be same shape as self")
  3864. key = self._constructor(key, **self._construct_axes_dict(), copy=False)
  3865. if key.size and not all(is_bool_dtype(dtype) for dtype in key.dtypes):
  3866. raise TypeError(
  3867. "Must pass DataFrame or 2-d ndarray with boolean values only"
  3868. )
  3869. self._check_setitem_copy()
  3870. self._where(-key, value, inplace=True)
  3871. def _set_item_frame_value(self, key, value: DataFrame) -> None:
  3872. self._ensure_valid_index(value)
  3873. # align columns
  3874. if key in self.columns:
  3875. loc = self.columns.get_loc(key)
  3876. cols = self.columns[loc]
  3877. len_cols = 1 if is_scalar(cols) or isinstance(cols, tuple) else len(cols)
  3878. if len_cols != len(value.columns):
  3879. raise ValueError("Columns must be same length as key")
  3880. # align right-hand-side columns if self.columns
  3881. # is multi-index and self[key] is a sub-frame
  3882. if isinstance(self.columns, MultiIndex) and isinstance(
  3883. loc, (slice, Series, np.ndarray, Index)
  3884. ):
  3885. cols_droplevel = maybe_droplevels(cols, key)
  3886. if len(cols_droplevel) and not cols_droplevel.equals(value.columns):
  3887. value = value.reindex(cols_droplevel, axis=1)
  3888. for col, col_droplevel in zip(cols, cols_droplevel):
  3889. self[col] = value[col_droplevel]
  3890. return
  3891. if is_scalar(cols):
  3892. self[cols] = value[value.columns[0]]
  3893. return
  3894. locs: np.ndarray | list
  3895. if isinstance(loc, slice):
  3896. locs = np.arange(loc.start, loc.stop, loc.step)
  3897. elif is_scalar(loc):
  3898. locs = [loc]
  3899. else:
  3900. locs = loc.nonzero()[0]
  3901. return self.isetitem(locs, value)
  3902. if len(value.columns) > 1:
  3903. raise ValueError(
  3904. "Cannot set a DataFrame with multiple columns to the single "
  3905. f"column {key}"
  3906. )
  3907. elif len(value.columns) == 0:
  3908. raise ValueError(
  3909. f"Cannot set a DataFrame without columns to the column {key}"
  3910. )
  3911. self[key] = value[value.columns[0]]
  3912. def _iset_item_mgr(
  3913. self,
  3914. loc: int | slice | np.ndarray,
  3915. value,
  3916. inplace: bool = False,
  3917. refs: BlockValuesRefs | None = None,
  3918. ) -> None:
  3919. # when called from _set_item_mgr loc can be anything returned from get_loc
  3920. self._mgr.iset(loc, value, inplace=inplace, refs=refs)
  3921. self._clear_item_cache()
  3922. def _set_item_mgr(
  3923. self, key, value: ArrayLike, refs: BlockValuesRefs | None = None
  3924. ) -> None:
  3925. try:
  3926. loc = self._info_axis.get_loc(key)
  3927. except KeyError:
  3928. # This item wasn't present, just insert at end
  3929. self._mgr.insert(len(self._info_axis), key, value, refs)
  3930. else:
  3931. self._iset_item_mgr(loc, value, refs=refs)
  3932. # check if we are modifying a copy
  3933. # try to set first as we want an invalid
  3934. # value exception to occur first
  3935. if len(self):
  3936. self._check_setitem_copy()
  3937. def _iset_item(self, loc: int, value: Series, inplace: bool = True) -> None:
  3938. # We are only called from _replace_columnwise which guarantees that
  3939. # no reindex is necessary
  3940. if using_copy_on_write():
  3941. self._iset_item_mgr(
  3942. loc, value._values, inplace=inplace, refs=value._references
  3943. )
  3944. else:
  3945. self._iset_item_mgr(loc, value._values.copy(), inplace=True)
  3946. # check if we are modifying a copy
  3947. # try to set first as we want an invalid
  3948. # value exception to occur first
  3949. if len(self):
  3950. self._check_setitem_copy()
  3951. def _set_item(self, key, value) -> None:
  3952. """
  3953. Add series to DataFrame in specified column.
  3954. If series is a numpy-array (not a Series/TimeSeries), it must be the
  3955. same length as the DataFrames index or an error will be thrown.
  3956. Series/TimeSeries will be conformed to the DataFrames index to
  3957. ensure homogeneity.
  3958. """
  3959. value, refs = self._sanitize_column(value)
  3960. if (
  3961. key in self.columns
  3962. and value.ndim == 1
  3963. and not isinstance(value.dtype, ExtensionDtype)
  3964. ):
  3965. # broadcast across multiple columns if necessary
  3966. if not self.columns.is_unique or isinstance(self.columns, MultiIndex):
  3967. existing_piece = self[key]
  3968. if isinstance(existing_piece, DataFrame):
  3969. value = np.tile(value, (len(existing_piece.columns), 1)).T
  3970. refs = None
  3971. self._set_item_mgr(key, value, refs)
  3972. def _set_value(
  3973. self, index: IndexLabel, col, value: Scalar, takeable: bool = False
  3974. ) -> None:
  3975. """
  3976. Put single value at passed column and index.
  3977. Parameters
  3978. ----------
  3979. index : Label
  3980. row label
  3981. col : Label
  3982. column label
  3983. value : scalar
  3984. takeable : bool, default False
  3985. Sets whether or not index/col interpreted as indexers
  3986. """
  3987. try:
  3988. if takeable:
  3989. icol = col
  3990. iindex = cast(int, index)
  3991. else:
  3992. icol = self.columns.get_loc(col)
  3993. iindex = self.index.get_loc(index)
  3994. self._mgr.column_setitem(icol, iindex, value, inplace_only=True)
  3995. self._clear_item_cache()
  3996. except (KeyError, TypeError, ValueError, LossySetitemError):
  3997. # get_loc might raise a KeyError for missing labels (falling back
  3998. # to (i)loc will do expansion of the index)
  3999. # column_setitem will do validation that may raise TypeError,
  4000. # ValueError, or LossySetitemError
  4001. # set using a non-recursive method & reset the cache
  4002. if takeable:
  4003. self.iloc[index, col] = value
  4004. else:
  4005. self.loc[index, col] = value
  4006. self._item_cache.pop(col, None)
  4007. except InvalidIndexError as ii_err:
  4008. # GH48729: Seems like you are trying to assign a value to a
  4009. # row when only scalar options are permitted
  4010. raise InvalidIndexError(
  4011. f"You can only assign a scalar value not a {type(value)}"
  4012. ) from ii_err
  4013. def _ensure_valid_index(self, value) -> None:
  4014. """
  4015. Ensure that if we don't have an index, that we can create one from the
  4016. passed value.
  4017. """
  4018. # GH5632, make sure that we are a Series convertible
  4019. if not len(self.index) and is_list_like(value) and len(value):
  4020. if not isinstance(value, DataFrame):
  4021. try:
  4022. value = Series(value)
  4023. except (ValueError, NotImplementedError, TypeError) as err:
  4024. raise ValueError(
  4025. "Cannot set a frame with no defined index "
  4026. "and a value that cannot be converted to a Series"
  4027. ) from err
  4028. # GH31368 preserve name of index
  4029. index_copy = value.index.copy()
  4030. if self.index.name is not None:
  4031. index_copy.name = self.index.name
  4032. self._mgr = self._mgr.reindex_axis(index_copy, axis=1, fill_value=np.nan)
  4033. def _box_col_values(self, values: SingleDataManager, loc: int) -> Series:
  4034. """
  4035. Provide boxed values for a column.
  4036. """
  4037. # Lookup in columns so that if e.g. a str datetime was passed
  4038. # we attach the Timestamp object as the name.
  4039. name = self.columns[loc]
  4040. # We get index=self.index bc values is a SingleDataManager
  4041. obj = self._constructor_sliced_from_mgr(values, axes=values.axes)
  4042. obj._name = name
  4043. return obj.__finalize__(self)
  4044. # ----------------------------------------------------------------------
  4045. # Lookup Caching
  4046. def _clear_item_cache(self) -> None:
  4047. self._item_cache.clear()
  4048. def _get_item_cache(self, item: Hashable) -> Series:
  4049. """Return the cached item, item represents a label indexer."""
  4050. if using_copy_on_write() or warn_copy_on_write():
  4051. loc = self.columns.get_loc(item)
  4052. return self._ixs(loc, axis=1)
  4053. cache = self._item_cache
  4054. res = cache.get(item)
  4055. if res is None:
  4056. # All places that call _get_item_cache have unique columns,
  4057. # pending resolution of GH#33047
  4058. loc = self.columns.get_loc(item)
  4059. res = self._ixs(loc, axis=1)
  4060. cache[item] = res
  4061. # for a chain
  4062. res._is_copy = self._is_copy
  4063. return res
  4064. def _reset_cacher(self) -> None:
  4065. # no-op for DataFrame
  4066. pass
  4067. def _maybe_cache_changed(self, item, value: Series, inplace: bool) -> None:
  4068. """
  4069. The object has called back to us saying maybe it has changed.
  4070. """
  4071. loc = self._info_axis.get_loc(item)
  4072. arraylike = value._values
  4073. old = self._ixs(loc, axis=1)
  4074. if old._values is value._values and inplace:
  4075. # GH#46149 avoid making unnecessary copies/block-splitting
  4076. return
  4077. self._mgr.iset(loc, arraylike, inplace=inplace)
  4078. # ----------------------------------------------------------------------
  4079. # Unsorted
  4080. @overload
  4081. def query(self, expr: str, *, inplace: Literal[False] = ..., **kwargs) -> DataFrame:
  4082. ...
  4083. @overload
  4084. def query(self, expr: str, *, inplace: Literal[True], **kwargs) -> None:
  4085. ...
  4086. @overload
  4087. def query(self, expr: str, *, inplace: bool = ..., **kwargs) -> DataFrame | None:
  4088. ...
  4089. def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | None:
  4090. """
  4091. Query the columns of a DataFrame with a boolean expression.
  4092. Parameters
  4093. ----------
  4094. expr : str
  4095. The query string to evaluate.
  4096. You can refer to variables
  4097. in the environment by prefixing them with an '@' character like
  4098. ``@a + b``.
  4099. You can refer to column names that are not valid Python variable names
  4100. by surrounding them in backticks. Thus, column names containing spaces
  4101. or punctuations (besides underscores) or starting with digits must be
  4102. surrounded by backticks. (For example, a column named "Area (cm^2)" would
  4103. be referenced as ```Area (cm^2)```). Column names which are Python keywords
  4104. (like "list", "for", "import", etc) cannot be used.
  4105. For example, if one of your columns is called ``a a`` and you want
  4106. to sum it with ``b``, your query should be ```a a` + b``.
  4107. inplace : bool
  4108. Whether to modify the DataFrame rather than creating a new one.
  4109. **kwargs
  4110. See the documentation for :func:`eval` for complete details
  4111. on the keyword arguments accepted by :meth:`DataFrame.query`.
  4112. Returns
  4113. -------
  4114. DataFrame or None
  4115. DataFrame resulting from the provided query expression or
  4116. None if ``inplace=True``.
  4117. See Also
  4118. --------
  4119. eval : Evaluate a string describing operations on
  4120. DataFrame columns.
  4121. DataFrame.eval : Evaluate a string describing operations on
  4122. DataFrame columns.
  4123. Notes
  4124. -----
  4125. The result of the evaluation of this expression is first passed to
  4126. :attr:`DataFrame.loc` and if that fails because of a
  4127. multidimensional key (e.g., a DataFrame) then the result will be passed
  4128. to :meth:`DataFrame.__getitem__`.
  4129. This method uses the top-level :func:`eval` function to
  4130. evaluate the passed query.
  4131. The :meth:`~pandas.DataFrame.query` method uses a slightly
  4132. modified Python syntax by default. For example, the ``&`` and ``|``
  4133. (bitwise) operators have the precedence of their boolean cousins,
  4134. :keyword:`and` and :keyword:`or`. This *is* syntactically valid Python,
  4135. however the semantics are different.
  4136. You can change the semantics of the expression by passing the keyword
  4137. argument ``parser='python'``. This enforces the same semantics as
  4138. evaluation in Python space. Likewise, you can pass ``engine='python'``
  4139. to evaluate an expression using Python itself as a backend. This is not
  4140. recommended as it is inefficient compared to using ``numexpr`` as the
  4141. engine.
  4142. The :attr:`DataFrame.index` and
  4143. :attr:`DataFrame.columns` attributes of the
  4144. :class:`~pandas.DataFrame` instance are placed in the query namespace
  4145. by default, which allows you to treat both the index and columns of the
  4146. frame as a column in the frame.
  4147. The identifier ``index`` is used for the frame index; you can also
  4148. use the name of the index to identify it in a query. Please note that
  4149. Python keywords may not be used as identifiers.
  4150. For further details and examples see the ``query`` documentation in
  4151. :ref:`indexing <indexing.query>`.
  4152. *Backtick quoted variables*
  4153. Backtick quoted variables are parsed as literal Python code and
  4154. are converted internally to a Python valid identifier.
  4155. This can lead to the following problems.
  4156. During parsing a number of disallowed characters inside the backtick
  4157. quoted string are replaced by strings that are allowed as a Python identifier.
  4158. These characters include all operators in Python, the space character, the
  4159. question mark, the exclamation mark, the dollar sign, and the euro sign.
  4160. For other characters that fall outside the ASCII range (U+0001..U+007F)
  4161. and those that are not further specified in PEP 3131,
  4162. the query parser will raise an error.
  4163. This excludes whitespace different than the space character,
  4164. but also the hashtag (as it is used for comments) and the backtick
  4165. itself (backtick can also not be escaped).
  4166. In a special case, quotes that make a pair around a backtick can
  4167. confuse the parser.
  4168. For example, ```it's` > `that's``` will raise an error,
  4169. as it forms a quoted string (``'s > `that'``) with a backtick inside.
  4170. See also the Python documentation about lexical analysis
  4171. (https://docs.python.org/3/reference/lexical_analysis.html)
  4172. in combination with the source code in :mod:`pandas.core.computation.parsing`.
  4173. Examples
  4174. --------
  4175. >>> df = pd.DataFrame({'A': range(1, 6),
  4176. ... 'B': range(10, 0, -2),
  4177. ... 'C C': range(10, 5, -1)})
  4178. >>> df
  4179. A B C C
  4180. 0 1 10 10
  4181. 1 2 8 9
  4182. 2 3 6 8
  4183. 3 4 4 7
  4184. 4 5 2 6
  4185. >>> df.query('A > B')
  4186. A B C C
  4187. 4 5 2 6
  4188. The previous expression is equivalent to
  4189. >>> df[df.A > df.B]
  4190. A B C C
  4191. 4 5 2 6
  4192. For columns with spaces in their name, you can use backtick quoting.
  4193. >>> df.query('B == `C C`')
  4194. A B C C
  4195. 0 1 10 10
  4196. The previous expression is equivalent to
  4197. >>> df[df.B == df['C C']]
  4198. A B C C
  4199. 0 1 10 10
  4200. """
  4201. inplace = validate_bool_kwarg(inplace, "inplace")
  4202. if not isinstance(expr, str):
  4203. msg = f"expr must be a string to be evaluated, {type(expr)} given"
  4204. raise ValueError(msg)
  4205. kwargs["level"] = kwargs.pop("level", 0) + 1
  4206. kwargs["target"] = None
  4207. res = self.eval(expr, **kwargs)
  4208. try:
  4209. result = self.loc[res]
  4210. except ValueError:
  4211. # when res is multi-dimensional loc raises, but this is sometimes a
  4212. # valid query
  4213. result = self[res]
  4214. if inplace:
  4215. self._update_inplace(result)
  4216. return None
  4217. else:
  4218. return result
  4219. @overload
  4220. def eval(self, expr: str, *, inplace: Literal[False] = ..., **kwargs) -> Any:
  4221. ...
  4222. @overload
  4223. def eval(self, expr: str, *, inplace: Literal[True], **kwargs) -> None:
  4224. ...
  4225. def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None:
  4226. """
  4227. Evaluate a string describing operations on DataFrame columns.
  4228. Operates on columns only, not specific rows or elements. This allows
  4229. `eval` to run arbitrary code, which can make you vulnerable to code
  4230. injection if you pass user input to this function.
  4231. Parameters
  4232. ----------
  4233. expr : str
  4234. The expression string to evaluate.
  4235. inplace : bool, default False
  4236. If the expression contains an assignment, whether to perform the
  4237. operation inplace and mutate the existing DataFrame. Otherwise,
  4238. a new DataFrame is returned.
  4239. **kwargs
  4240. See the documentation for :func:`eval` for complete details
  4241. on the keyword arguments accepted by
  4242. :meth:`~pandas.DataFrame.query`.
  4243. Returns
  4244. -------
  4245. ndarray, scalar, pandas object, or None
  4246. The result of the evaluation or None if ``inplace=True``.
  4247. See Also
  4248. --------
  4249. DataFrame.query : Evaluates a boolean expression to query the columns
  4250. of a frame.
  4251. DataFrame.assign : Can evaluate an expression or function to create new
  4252. values for a column.
  4253. eval : Evaluate a Python expression as a string using various
  4254. backends.
  4255. Notes
  4256. -----
  4257. For more details see the API documentation for :func:`~eval`.
  4258. For detailed examples see :ref:`enhancing performance with eval
  4259. <enhancingperf.eval>`.
  4260. Examples
  4261. --------
  4262. >>> df = pd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)})
  4263. >>> df
  4264. A B
  4265. 0 1 10
  4266. 1 2 8
  4267. 2 3 6
  4268. 3 4 4
  4269. 4 5 2
  4270. >>> df.eval('A + B')
  4271. 0 11
  4272. 1 10
  4273. 2 9
  4274. 3 8
  4275. 4 7
  4276. dtype: int64
  4277. Assignment is allowed though by default the original DataFrame is not
  4278. modified.
  4279. >>> df.eval('C = A + B')
  4280. A B C
  4281. 0 1 10 11
  4282. 1 2 8 10
  4283. 2 3 6 9
  4284. 3 4 4 8
  4285. 4 5 2 7
  4286. >>> df
  4287. A B
  4288. 0 1 10
  4289. 1 2 8
  4290. 2 3 6
  4291. 3 4 4
  4292. 4 5 2
  4293. Multiple columns can be assigned to using multi-line expressions:
  4294. >>> df.eval(
  4295. ... '''
  4296. ... C = A + B
  4297. ... D = A - B
  4298. ... '''
  4299. ... )
  4300. A B C D
  4301. 0 1 10 11 -9
  4302. 1 2 8 10 -6
  4303. 2 3 6 9 -3
  4304. 3 4 4 8 0
  4305. 4 5 2 7 3
  4306. """
  4307. from pandas.core.computation.eval import eval as _eval
  4308. inplace = validate_bool_kwarg(inplace, "inplace")
  4309. kwargs["level"] = kwargs.pop("level", 0) + 1
  4310. index_resolvers = self._get_index_resolvers()
  4311. column_resolvers = self._get_cleaned_column_resolvers()
  4312. resolvers = column_resolvers, index_resolvers
  4313. if "target" not in kwargs:
  4314. kwargs["target"] = self
  4315. kwargs["resolvers"] = tuple(kwargs.get("resolvers", ())) + resolvers
  4316. return _eval(expr, inplace=inplace, **kwargs)
  4317. def select_dtypes(self, include=None, exclude=None) -> Self:
  4318. """
  4319. Return a subset of the DataFrame's columns based on the column dtypes.
  4320. Parameters
  4321. ----------
  4322. include, exclude : scalar or list-like
  4323. A selection of dtypes or strings to be included/excluded. At least
  4324. one of these parameters must be supplied.
  4325. Returns
  4326. -------
  4327. DataFrame
  4328. The subset of the frame including the dtypes in ``include`` and
  4329. excluding the dtypes in ``exclude``.
  4330. Raises
  4331. ------
  4332. ValueError
  4333. * If both of ``include`` and ``exclude`` are empty
  4334. * If ``include`` and ``exclude`` have overlapping elements
  4335. * If any kind of string dtype is passed in.
  4336. See Also
  4337. --------
  4338. DataFrame.dtypes: Return Series with the data type of each column.
  4339. Notes
  4340. -----
  4341. * To select all *numeric* types, use ``np.number`` or ``'number'``
  4342. * To select strings you must use the ``object`` dtype, but note that
  4343. this will return *all* object dtype columns. With
  4344. ``pd.options.future.infer_string`` enabled, using ``"str"`` will
  4345. work to select all string columns.
  4346. * See the `numpy dtype hierarchy
  4347. <https://numpy.org/doc/stable/reference/arrays.scalars.html>`__
  4348. * To select datetimes, use ``np.datetime64``, ``'datetime'`` or
  4349. ``'datetime64'``
  4350. * To select timedeltas, use ``np.timedelta64``, ``'timedelta'`` or
  4351. ``'timedelta64'``
  4352. * To select Pandas categorical dtypes, use ``'category'``
  4353. * To select Pandas datetimetz dtypes, use ``'datetimetz'``
  4354. or ``'datetime64[ns, tz]'``
  4355. Examples
  4356. --------
  4357. >>> df = pd.DataFrame({'a': [1, 2] * 3,
  4358. ... 'b': [True, False] * 3,
  4359. ... 'c': [1.0, 2.0] * 3})
  4360. >>> df
  4361. a b c
  4362. 0 1 True 1.0
  4363. 1 2 False 2.0
  4364. 2 1 True 1.0
  4365. 3 2 False 2.0
  4366. 4 1 True 1.0
  4367. 5 2 False 2.0
  4368. >>> df.select_dtypes(include='bool')
  4369. b
  4370. 0 True
  4371. 1 False
  4372. 2 True
  4373. 3 False
  4374. 4 True
  4375. 5 False
  4376. >>> df.select_dtypes(include=['float64'])
  4377. c
  4378. 0 1.0
  4379. 1 2.0
  4380. 2 1.0
  4381. 3 2.0
  4382. 4 1.0
  4383. 5 2.0
  4384. >>> df.select_dtypes(exclude=['int64'])
  4385. b c
  4386. 0 True 1.0
  4387. 1 False 2.0
  4388. 2 True 1.0
  4389. 3 False 2.0
  4390. 4 True 1.0
  4391. 5 False 2.0
  4392. """
  4393. if not is_list_like(include):
  4394. include = (include,) if include is not None else ()
  4395. if not is_list_like(exclude):
  4396. exclude = (exclude,) if exclude is not None else ()
  4397. selection = (frozenset(include), frozenset(exclude))
  4398. if not any(selection):
  4399. raise ValueError("at least one of include or exclude must be nonempty")
  4400. # convert the myriad valid dtypes object to a single representation
  4401. def check_int_infer_dtype(dtypes):
  4402. converted_dtypes: list[type] = []
  4403. for dtype in dtypes:
  4404. # Numpy maps int to different types (int32, in64) on Windows and Linux
  4405. # see https://github.com/numpy/numpy/issues/9464
  4406. if (isinstance(dtype, str) and dtype == "int") or (dtype is int):
  4407. converted_dtypes.append(np.int32)
  4408. converted_dtypes.append(np.int64)
  4409. elif dtype == "float" or dtype is float:
  4410. # GH#42452 : np.dtype("float") coerces to np.float64 from Numpy 1.20
  4411. converted_dtypes.extend([np.float64, np.float32])
  4412. else:
  4413. converted_dtypes.append(infer_dtype_from_object(dtype))
  4414. return frozenset(converted_dtypes)
  4415. include = check_int_infer_dtype(include)
  4416. exclude = check_int_infer_dtype(exclude)
  4417. for dtypes in (include, exclude):
  4418. invalidate_string_dtypes(dtypes)
  4419. # can't both include AND exclude!
  4420. if not include.isdisjoint(exclude):
  4421. raise ValueError(f"include and exclude overlap on {(include & exclude)}")
  4422. def dtype_predicate(dtype: DtypeObj, dtypes_set) -> bool:
  4423. # GH 46870: BooleanDtype._is_numeric == True but should be excluded
  4424. dtype = dtype if not isinstance(dtype, ArrowDtype) else dtype.numpy_dtype
  4425. return (
  4426. issubclass(dtype.type, tuple(dtypes_set))
  4427. or (
  4428. np.number in dtypes_set
  4429. and getattr(dtype, "_is_numeric", False)
  4430. and not is_bool_dtype(dtype)
  4431. )
  4432. # backwards compat for the default `str` dtype being selected by object
  4433. or (
  4434. isinstance(dtype, StringDtype)
  4435. and dtype.na_value is np.nan
  4436. and np.object_ in dtypes_set
  4437. )
  4438. )
  4439. def predicate(arr: ArrayLike) -> bool:
  4440. dtype = arr.dtype
  4441. if include:
  4442. if not dtype_predicate(dtype, include):
  4443. return False
  4444. if exclude:
  4445. if dtype_predicate(dtype, exclude):
  4446. return False
  4447. return True
  4448. mgr = self._mgr._get_data_subset(predicate).copy(deep=None)
  4449. # error: Incompatible return value type (got "DataFrame", expected "Self")
  4450. return self._constructor_from_mgr(mgr, axes=mgr.axes).__finalize__(self) # type: ignore[return-value]
  4451. def insert(
  4452. self,
  4453. loc: int,
  4454. column: Hashable,
  4455. value: Scalar | AnyArrayLike,
  4456. allow_duplicates: bool | lib.NoDefault = lib.no_default,
  4457. ) -> None:
  4458. """
  4459. Insert column into DataFrame at specified location.
  4460. Raises a ValueError if `column` is already contained in the DataFrame,
  4461. unless `allow_duplicates` is set to True.
  4462. Parameters
  4463. ----------
  4464. loc : int
  4465. Insertion index. Must verify 0 <= loc <= len(columns).
  4466. column : str, number, or hashable object
  4467. Label of the inserted column.
  4468. value : Scalar, Series, or array-like
  4469. Content of the inserted column.
  4470. allow_duplicates : bool, optional, default lib.no_default
  4471. Allow duplicate column labels to be created.
  4472. See Also
  4473. --------
  4474. Index.insert : Insert new item by index.
  4475. Examples
  4476. --------
  4477. >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
  4478. >>> df
  4479. col1 col2
  4480. 0 1 3
  4481. 1 2 4
  4482. >>> df.insert(1, "newcol", [99, 99])
  4483. >>> df
  4484. col1 newcol col2
  4485. 0 1 99 3
  4486. 1 2 99 4
  4487. >>> df.insert(0, "col1", [100, 100], allow_duplicates=True)
  4488. >>> df
  4489. col1 col1 newcol col2
  4490. 0 100 1 99 3
  4491. 1 100 2 99 4
  4492. Notice that pandas uses index alignment in case of `value` from type `Series`:
  4493. >>> df.insert(0, "col0", pd.Series([5, 6], index=[1, 2]))
  4494. >>> df
  4495. col0 col1 col1 newcol col2
  4496. 0 NaN 100 1 99 3
  4497. 1 5.0 100 2 99 4
  4498. """
  4499. if allow_duplicates is lib.no_default:
  4500. allow_duplicates = False
  4501. if allow_duplicates and not self.flags.allows_duplicate_labels:
  4502. raise ValueError(
  4503. "Cannot specify 'allow_duplicates=True' when "
  4504. "'self.flags.allows_duplicate_labels' is False."
  4505. )
  4506. if not allow_duplicates and column in self.columns:
  4507. # Should this be a different kind of error??
  4508. raise ValueError(f"cannot insert {column}, already exists")
  4509. if not is_integer(loc):
  4510. raise TypeError("loc must be int")
  4511. # convert non stdlib ints to satisfy typing checks
  4512. loc = int(loc)
  4513. if isinstance(value, DataFrame) and len(value.columns) > 1:
  4514. raise ValueError(
  4515. f"Expected a one-dimensional object, got a DataFrame with "
  4516. f"{len(value.columns)} columns instead."
  4517. )
  4518. elif isinstance(value, DataFrame):
  4519. value = value.iloc[:, 0]
  4520. value, refs = self._sanitize_column(value)
  4521. self._mgr.insert(loc, column, value, refs=refs)
  def assign(self, **kwargs) -> DataFrame:
      r"""
      Return a copy of the DataFrame with columns added or replaced.

      All original columns are kept; a keyword whose name matches an
      existing column overwrites that column in the result.

      Parameters
      ----------
      **kwargs : dict of {str: callable or Series}
          The column names are keywords. If the values are
          callable, they are computed on the DataFrame and
          assigned to the new columns. The callable must not
          change input DataFrame (though pandas doesn't check it).
          If the values are not callable, (e.g. a Series, scalar, or array),
          they are simply assigned.

      Returns
      -------
      DataFrame
          A new DataFrame with the new columns in addition to
          all the existing columns.

      Notes
      -----
      Assigning multiple columns within the same ``assign`` is possible.
      Later items in '\*\*kwargs' may refer to newly created or modified
      columns in 'df'; items are computed and assigned into 'df' in order.

      Examples
      --------
      >>> df = pd.DataFrame({'temp_c': [17.0, 25.0]},
      ...                   index=['Portland', 'Berkeley'])
      >>> df
                temp_c
      Portland    17.0
      Berkeley    25.0

      Where the value is a callable, evaluated on `df`:

      >>> df.assign(temp_f=lambda x: x.temp_c * 9 / 5 + 32)
                temp_c  temp_f
      Portland    17.0    62.6
      Berkeley    25.0    77.0

      Alternatively, the same behavior can be achieved by directly
      referencing an existing Series or sequence:

      >>> df.assign(temp_f=df['temp_c'] * 9 / 5 + 32)
                temp_c  temp_f
      Portland    17.0    62.6
      Berkeley    25.0    77.0

      You can create multiple columns within the same assign where one
      of the columns depends on another one defined within the same assign:

      >>> df.assign(temp_f=lambda x: x['temp_c'] * 9 / 5 + 32,
      ...           temp_k=lambda x: (x['temp_f'] + 459.67) * 5 / 9)
                temp_c  temp_f  temp_k
      Portland    17.0    62.6  290.15
      Berkeley    25.0    77.0  298.15
      """
      result = self.copy(deep=None)

      for name, spec in kwargs.items():
          # Callables are evaluated against the frame built so far, which is
          # what lets a later keyword use a column created by an earlier one.
          new_column = com.apply_if_callable(spec, result)
          result[name] = new_column

      return result
  def _sanitize_column(self, value) -> tuple[ArrayLike, BlockValuesRefs | None]:
      """
      Validate `value` and turn it into an array usable as a new column.

      New columns go into the BlockManager as new blocks, so the result is
      always copied (or a reference is being tracked to it under CoW).

      Parameters
      ----------
      value : scalar, Series, or array-like

      Returns
      -------
      tuple of numpy.ndarray or ExtensionArray and optional BlockValuesRefs
      """
      self._ensure_valid_index(value)

      # Using a DataFrame would mean coercing values to one dtype
      assert not isinstance(value, DataFrame)

      if is_dict_like(value):
          # Dict-likes are aligned on this frame's index rather than
          # length-checked; anything that is not a Series is wrapped first.
          as_series = value if isinstance(value, Series) else Series(value)
          return _reindex_for_setitem(as_series, self.index)

      if is_list_like(value):
          com.require_length_match(value, self.index)

      arr = sanitize_array(value, self.index, copy=True, allow_2d=True)

      dtype_was_inferred = (
          isinstance(value, Index)
          and value.dtype == "object"
          and arr.dtype != value.dtype
      )
      if dtype_was_inferred:
          # TODO: Remove kludge in sanitize_array for string mode when enforcing
          #  this deprecation
          warnings.warn(
              "Setting an Index with object dtype into a DataFrame will stop "
              "inferring another dtype in a future version. Cast the Index "
              "explicitly before setting it into the DataFrame.",
              FutureWarning,
              stacklevel=find_stack_level(),
          )

      # Freshly built array: no references to track.
      return arr, None
  4614. @property
  4615. def _series(self):
  4616. return {item: self._ixs(idx, axis=1) for idx, item in enumerate(self.columns)}
  4617. # ----------------------------------------------------------------------
  4618. # Reindexing and alignment
  def _reindex_multi(
      self, axes: dict[str, Index], copy: bool, fill_value
  ) -> DataFrame:
      """
      Reindex rows and columns at the same time.

      We are guaranteed non-Nones in the axes, i.e. both ``axes["index"]``
      and ``axes["columns"]`` are provided.
      """
      new_index, row_indexer = self.index.reindex(axes["index"])
      new_columns, col_indexer = self.columns.reindex(axes["columns"])

      if row_indexer is None or col_indexer is None:
          # At least one axis came back without an indexer: use the general
          # per-axis reindexing path.
          return self._reindex_with_indexers(
              {0: [new_index, row_indexer], 1: [new_columns, col_indexer]},
              copy=copy,
              fill_value=fill_value,
          )

      # Fastpath.  By doing two 'take's at once we avoid making an
      #  unnecessary copy.
      # We only get here with `self._can_fast_transpose`, which (almost)
      #  ensures that self.values is cheap. It may be worth making this
      #  condition more specific.
      both_indexers = (row_indexer, col_indexer)
      new_values = take_2d_multi(self.values, both_indexers, fill_value=fill_value)
      return self._constructor(
          new_values, index=new_index, columns=new_columns, copy=False
      )
  # NOTE: the docstring of this method is assembled by the decorators below
  # instead of being written inline. Decorators apply bottom-up: the shared
  # ``NDFrame.set_axis`` docstring is attached first, ``@Substitution`` then
  # fills in its template fields with the DataFrame-specific wording, and the
  # DataFrame examples are attached last (presumably appended; confirm against
  # the ``Appender`` helper). Giving the function a literal docstring would
  # likely change the generated text.
  @Appender(
      """
      Examples
      --------
      >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})

      Change the row labels.

      >>> df.set_axis(['a', 'b', 'c'], axis='index')
         A  B
      a  1  4
      b  2  5
      c  3  6

      Change the column labels.

      >>> df.set_axis(['I', 'II'], axis='columns')
         I  II
      0  1   4
      1  2   5
      2  3   6
      """
  )
  @Substitution(
      klass=_shared_doc_kwargs["klass"],
      axes_single_arg=_shared_doc_kwargs["axes_single_arg"],
      extended_summary_sub=" column or",
      axis_description_sub=", and 1 identifies the columns",
      see_also_sub=" or columns",
  )
  @Appender(NDFrame.set_axis.__doc__)
  def set_axis(
      self,
      labels,
      *,
      axis: Axis = 0,
      copy: bool | None = None,
  ) -> DataFrame:
      # Pure pass-through to the parent implementation; the override exists
      # to attach the DataFrame-specific documentation and return annotation.
      return super().set_axis(labels, axis=axis, copy=copy)
  # NOTE: no literal docstring on purpose. ``@doc`` is given
  # ``NDFrame.reindex`` plus the DataFrame-specific ``klass`` and
  # ``optional_reindex`` entries of ``_shared_doc_kwargs``; presumably it
  # builds the user-facing docstring from those (confirm against the ``doc``
  # helper before adding text here).
  @doc(
      NDFrame.reindex,
      klass=_shared_doc_kwargs["klass"],
      optional_reindex=_shared_doc_kwargs["optional_reindex"],
  )
  def reindex(
      self,
      labels=None,
      *,
      index=None,
      columns=None,
      axis: Axis | None = None,
      method: ReindexMethod | None = None,
      copy: bool | None = None,
      level: Level | None = None,
      fill_value: Scalar | None = np.nan,
      limit: int | None = None,
      tolerance=None,
  ) -> DataFrame:
      # Every argument is forwarded unchanged, by keyword, to the parent
      # implementation; nothing is validated or transformed at this level.
      return super().reindex(
          labels=labels,
          index=index,
          columns=columns,
          axis=axis,
          method=method,
          copy=copy,
          level=level,
          fill_value=fill_value,
          limit=limit,
          tolerance=tolerance,
      )
  # Typing overload: ``inplace=True`` (must be passed explicitly) -> None.
  @overload
  def drop(
      self,
      labels: IndexLabel = ...,
      *,
      axis: Axis = ...,
      index: IndexLabel = ...,
      columns: IndexLabel = ...,
      level: Level = ...,
      inplace: Literal[True],
      errors: IgnoreRaise = ...,
  ) -> None:
      ...

  # Typing overload: ``inplace=False`` (or omitted) -> DataFrame.
  @overload
  def drop(
      self,
      labels: IndexLabel = ...,
      *,
      axis: Axis = ...,
      index: IndexLabel = ...,
      columns: IndexLabel = ...,
      level: Level = ...,
      inplace: Literal[False] = ...,
      errors: IgnoreRaise = ...,
  ) -> DataFrame:
      ...

  # Typing overload: ``inplace`` only known to be a bool -> DataFrame | None.
  @overload
  def drop(
      self,
      labels: IndexLabel = ...,
      *,
      axis: Axis = ...,
      index: IndexLabel = ...,
      columns: IndexLabel = ...,
      level: Level = ...,
      inplace: bool = ...,
      errors: IgnoreRaise = ...,
  ) -> DataFrame | None:
      ...

  def drop(
      self,
      labels: IndexLabel | None = None,
      *,
      axis: Axis = 0,
      index: IndexLabel | None = None,
      columns: IndexLabel | None = None,
      level: Level | None = None,
      inplace: bool = False,
      errors: IgnoreRaise = "raise",
  ) -> DataFrame | None:
      """
      Drop specified labels from rows or columns.

      Remove rows or columns by specifying label names and corresponding
      axis, or by directly specifying index or column names. When using a
      multi-index, labels on different levels can be removed by specifying
      the level. See the :ref:`user guide <advanced.shown_levels>`
      for more information about the now unused levels.

      Parameters
      ----------
      labels : single label or list-like
          Index or column labels to drop. A tuple will be used as a single
          label and not treated as a list-like.
      axis : {0 or 'index', 1 or 'columns'}, default 0
          Whether to drop labels from the index (0 or 'index') or
          columns (1 or 'columns').
      index : single label or list-like
          Alternative to specifying axis (``labels, axis=0``
          is equivalent to ``index=labels``).
      columns : single label or list-like
          Alternative to specifying axis (``labels, axis=1``
          is equivalent to ``columns=labels``).
      level : int or level name, optional
          For MultiIndex, level from which the labels will be removed.
      inplace : bool, default False
          If False, return a copy. Otherwise, do operation
          in place and return None.
      errors : {'ignore', 'raise'}, default 'raise'
          If 'ignore', suppress error and only existing labels are
          dropped.

      Returns
      -------
      DataFrame or None
          DataFrame with the specified index or column labels removed,
          or None if ``inplace=True``.

      Raises
      ------
      KeyError
          If any of the labels is not found in the selected axis.

      See Also
      --------
      DataFrame.loc : Label-location based indexer for selection by label.
      DataFrame.dropna : Return DataFrame with labels on given axis omitted
          where (all or any) data are missing.
      DataFrame.drop_duplicates : Return DataFrame with duplicate rows
          removed, optionally only considering certain columns.
      Series.drop : Return Series with specified index labels removed.

      Examples
      --------
      >>> df = pd.DataFrame(np.arange(12).reshape(3, 4),
      ...                   columns=['A', 'B', 'C', 'D'])
      >>> df
         A  B   C   D
      0  0  1   2   3
      1  4  5   6   7
      2  8  9  10  11

      Drop columns

      >>> df.drop(['B', 'C'], axis=1)
         A   D
      0  0   3
      1  4   7
      2  8  11

      >>> df.drop(columns=['B', 'C'])
         A   D
      0  0   3
      1  4   7
      2  8  11

      Drop a row by index

      >>> df.drop([0, 1])
         A  B   C   D
      2  8  9  10  11

      Drop columns and/or rows of MultiIndex DataFrame

      >>> midx = pd.MultiIndex(levels=[['llama', 'cow', 'falcon'],
      ...                              ['speed', 'weight', 'length']],
      ...                      codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2],
      ...                             [0, 1, 2, 0, 1, 2, 0, 1, 2]])
      >>> df = pd.DataFrame(index=midx, columns=['big', 'small'],
      ...                   data=[[45, 30], [200, 100], [1.5, 1], [30, 20],
      ...                         [250, 150], [1.5, 0.8], [320, 250],
      ...                         [1, 0.8], [0.3, 0.2]])
      >>> df
                      big     small
      llama   speed   45.0    30.0
              weight  200.0   100.0
              length  1.5     1.0
      cow     speed   30.0    20.0
              weight  250.0   150.0
              length  1.5     0.8
      falcon  speed   320.0   250.0
              weight  1.0     0.8
              length  0.3     0.2

      Drop a specific index combination from the MultiIndex
      DataFrame, i.e., drop the combination ``'falcon'`` and
      ``'weight'``, which deletes only the corresponding row

      >>> df.drop(index=('falcon', 'weight'))
                      big     small
      llama   speed   45.0    30.0
              weight  200.0   100.0
              length  1.5     1.0
      cow     speed   30.0    20.0
              weight  250.0   150.0
              length  1.5     0.8
      falcon  speed   320.0   250.0
              length  0.3     0.2

      >>> df.drop(index='cow', columns='small')
                      big
      llama   speed   45.0
              weight  200.0
              length  1.5
      falcon  speed   320.0
              weight  1.0
              length  0.3

      >>> df.drop(index='length', level=1)
                      big     small
      llama   speed   45.0    30.0
              weight  200.0   100.0
      cow     speed   30.0    20.0
              weight  250.0   150.0
      falcon  speed   320.0   250.0
              weight  1.0     0.8
      """
      # All of the work happens in the parent class; this override only adds
      # the DataFrame-specific docstring and typing overloads.
      return super().drop(
          labels=labels,
          axis=axis,
          index=index,
          columns=columns,
          level=level,
          inplace=inplace,
          errors=errors,
      )
  # Typing overload: ``inplace=True`` (must be passed explicitly) -> None.
  @overload
  def rename(
      self,
      mapper: Renamer | None = ...,
      *,
      index: Renamer | None = ...,
      columns: Renamer | None = ...,
      axis: Axis | None = ...,
      copy: bool | None = ...,
      inplace: Literal[True],
      level: Level = ...,
      errors: IgnoreRaise = ...,
  ) -> None:
      ...

  # Typing overload: ``inplace=False`` (or omitted) -> DataFrame.
  @overload
  def rename(
      self,
      mapper: Renamer | None = ...,
      *,
      index: Renamer | None = ...,
      columns: Renamer | None = ...,
      axis: Axis | None = ...,
      copy: bool | None = ...,
      inplace: Literal[False] = ...,
      level: Level = ...,
      errors: IgnoreRaise = ...,
  ) -> DataFrame:
      ...

  # Typing overload: ``inplace`` only known to be a bool -> DataFrame | None.
  @overload
  def rename(
      self,
      mapper: Renamer | None = ...,
      *,
      index: Renamer | None = ...,
      columns: Renamer | None = ...,
      axis: Axis | None = ...,
      copy: bool | None = ...,
      inplace: bool = ...,
      level: Level = ...,
      errors: IgnoreRaise = ...,
  ) -> DataFrame | None:
      ...

  def rename(
      self,
      mapper: Renamer | None = None,
      *,
      index: Renamer | None = None,
      columns: Renamer | None = None,
      axis: Axis | None = None,
      copy: bool | None = None,
      inplace: bool = False,
      level: Level | None = None,
      errors: IgnoreRaise = "ignore",
  ) -> DataFrame | None:
      """
      Rename columns or index labels.

      Function / dict values must be unique (1-to-1). Labels not contained in
      a dict / Series will be left as-is. Extra labels listed don't throw an
      error.

      See the :ref:`user guide <basics.rename>` for more.

      Parameters
      ----------
      mapper : dict-like or function
          Dict-like or function transformations to apply to
          that axis' values. Use either ``mapper`` and ``axis`` to
          specify the axis to target with ``mapper``, or ``index`` and
          ``columns``.
      index : dict-like or function
          Alternative to specifying axis (``mapper, axis=0``
          is equivalent to ``index=mapper``).
      columns : dict-like or function
          Alternative to specifying axis (``mapper, axis=1``
          is equivalent to ``columns=mapper``).
      axis : {0 or 'index', 1 or 'columns'}, default 0
          Axis to target with ``mapper``. Can be either the axis name
          ('index', 'columns') or number (0, 1). The default is 'index'.
      copy : bool, default True
          Also copy underlying data.

          .. note::
              The `copy` keyword will change behavior in pandas 3.0.
              `Copy-on-Write
              <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
              will be enabled by default, which means that all methods with a
              `copy` keyword will use a lazy copy mechanism to defer the copy and
              ignore the `copy` keyword. The `copy` keyword will be removed in a
              future version of pandas.

              You can already get the future behavior and improvements through
              enabling copy on write ``pd.options.mode.copy_on_write = True``
      inplace : bool, default False
          Whether to modify the DataFrame rather than creating a new one.
          If True then value of copy is ignored.
      level : int or level name, default None
          In case of a MultiIndex, only rename labels in the specified
          level.
      errors : {'ignore', 'raise'}, default 'ignore'
          If 'raise', raise a `KeyError` when a dict-like `mapper`, `index`,
          or `columns` contains labels that are not present in the Index
          being transformed.
          If 'ignore', existing keys will be renamed and extra keys will be
          ignored.

      Returns
      -------
      DataFrame or None
          DataFrame with the renamed axis labels or None if ``inplace=True``.

      Raises
      ------
      KeyError
          If any of the labels is not found in the selected axis and
          "errors='raise'".

      See Also
      --------
      DataFrame.rename_axis : Set the name of the axis.

      Examples
      --------
      ``DataFrame.rename`` supports two calling conventions

      * ``(index=index_mapper, columns=columns_mapper, ...)``
      * ``(mapper, axis={'index', 'columns'}, ...)``

      We *highly* recommend using keyword arguments to clarify your
      intent.

      Rename columns using a mapping:

      >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
      >>> df.rename(columns={"A": "a", "B": "c"})
         a  c
      0  1  4
      1  2  5
      2  3  6

      Rename index using a mapping:

      >>> df.rename(index={0: "x", 1: "y", 2: "z"})
         A  B
      x  1  4
      y  2  5
      z  3  6

      Cast index labels to a different type:

      >>> df.index
      RangeIndex(start=0, stop=3, step=1)
      >>> df.rename(index=str).index
      Index(['0', '1', '2'], dtype='object')

      >>> df.rename(columns={"A": "a", "B": "b", "C": "c"}, errors="raise")
      Traceback (most recent call last):
      KeyError: ['C'] not found in axis

      Using axis-style parameters:

      >>> df.rename(str.lower, axis='columns')
         a  b
      0  1  4
      1  2  5
      2  3  6

      >>> df.rename({1: 2, 2: 4}, axis='index')
         A  B
      0  1  4
      2  2  5
      4  3  6
      """
      # Note the target is the parent's private ``_rename`` (not ``rename``);
      # every argument is forwarded unchanged by keyword.
      return super()._rename(
          mapper=mapper,
          index=index,
          columns=columns,
          axis=axis,
          copy=copy,
          inplace=inplace,
          level=level,
          errors=errors,
      )
  def pop(self, item: Hashable) -> Series:
      """
      Return item and drop from frame. Raise KeyError if not found.

      Parameters
      ----------
      item : label
          Label of column to be popped.

      Returns
      -------
      Series
          The column that was removed from the frame.

      Raises
      ------
      KeyError
          If `item` is not found.

      Examples
      --------
      >>> df = pd.DataFrame([('falcon', 'bird', 389.0),
      ...                    ('parrot', 'bird', 24.0),
      ...                    ('lion', 'mammal', 80.5),
      ...                    ('monkey', 'mammal', np.nan)],
      ...                   columns=('name', 'class', 'max_speed'))
      >>> df
           name   class  max_speed
      0  falcon    bird      389.0
      1  parrot    bird       24.0
      2    lion  mammal       80.5
      3  monkey  mammal        NaN

      >>> df.pop('class')
      0      bird
      1      bird
      2    mammal
      3    mammal
      Name: class, dtype: object

      >>> df
           name  max_speed
      0  falcon      389.0
      1  parrot       24.0
      2    lion       80.5
      3  monkey        NaN
      """
      # Delegates entirely to the parent class; the override only supplies
      # the DataFrame-specific docstring and the ``Series`` return annotation.
      return super().pop(item=item)
  5088. def _replace_columnwise(
  5089. self, mapping: dict[Hashable, tuple[Any, Any]], inplace: bool, regex
  5090. ):
  5091. """
  5092. Dispatch to Series.replace column-wise.
  5093. Parameters
  5094. ----------
  5095. mapping : dict
  5096. of the form {col: (target, value)}
  5097. inplace : bool
  5098. regex : bool or same types as `to_replace` in DataFrame.replace
  5099. Returns
  5100. -------
  5101. DataFrame or None
  5102. """
  5103. # Operate column-wise
  5104. res = self if inplace else self.copy(deep=None)
  5105. ax = self.columns
  5106. for i, ax_value in enumerate(ax):
  5107. if ax_value in mapping:
  5108. ser = self.iloc[:, i]
  5109. target, value = mapping[ax_value]
  5110. newobj = ser.replace(target, value, regex=regex)
  5111. res._iset_item(i, newobj, inplace=inplace)
  5112. if inplace:
  5113. return
  5114. return res.__finalize__(self)
  # NOTE: no literal docstring on purpose. ``@doc`` is handed
  # ``NDFrame.shift`` and presumably derives the user-facing docstring from
  # it (confirm against the ``doc`` helper before adding text here).
  @doc(NDFrame.shift, klass=_shared_doc_kwargs["klass"])
  def shift(
      self,
      periods: int | Sequence[int] = 1,
      freq: Frequency | None = None,
      axis: Axis = 0,
      fill_value: Hashable = lib.no_default,
      suffix: str | None = None,
  ) -> DataFrame:
      if freq is not None and fill_value is not lib.no_default:
          # GH#53832
          warnings.warn(
              "Passing a 'freq' together with a 'fill_value' silently ignores "
              "the fill_value and is deprecated. This will raise in a future "
              "version.",
              FutureWarning,
              stacklevel=find_stack_level(),
          )
          # Drop the fill_value so the rest of the method behaves as if it
          # had never been passed.
          fill_value = lib.no_default

      # Nothing to shift: hand back a copy. Note this returns before any of
      # the `periods` / `suffix` validation below runs.
      if self.empty:
          return self.copy()

      axis = self._get_axis_number(axis)

      if is_list_like(periods):
          # Several shifts requested at once: build one shifted frame per
          # period and concatenate them side by side.
          periods = cast(Sequence, periods)
          if axis == 1:
              raise ValueError(
                  "If `periods` contains multiple shifts, `axis` cannot be 1."
              )
          if len(periods) == 0:
              raise ValueError("If `periods` is an iterable, it cannot be empty.")
          # Function-level import (presumably to avoid a circular import at
          # module load time; confirm).
          from pandas.core.reshape.concat import concat

          shifted_dataframes = []
          for period in periods:
              if not is_integer(period):
                  raise TypeError(
                      f"Periods must be integer, but {period} is {type(period)}."
                  )
              period = cast(int, period)
              # Column labels get "<suffix>_<period>" (or just "_<period>")
              # appended so the concatenated columns can be told apart.
              shifted_dataframes.append(
                  super()
                  .shift(periods=period, freq=freq, axis=axis, fill_value=fill_value)
                  .add_suffix(f"{suffix}_{period}" if suffix else f"_{period}")
              )
          return concat(shifted_dataframes, axis=1)
      elif suffix:
          raise ValueError("Cannot specify `suffix` if `periods` is an int.")
      periods = cast(int, periods)

      ncols = len(self.columns)
      arrays = self._mgr.arrays
      # Column-wise shifts without a freq are handled here; everything else
      # falls through to the parent implementation at the bottom.
      if axis == 1 and periods != 0 and ncols > 0 and freq is None:
          if fill_value is lib.no_default:
              # We will infer fill_value to match the closest column

              # Use a column that we know is valid for our column's dtype GH#38434
              label = self.columns[0]

              if periods > 0:
                  # Keep the leading columns and pad on the left.
                  result = self.iloc[:, :-periods]
                  # `col` is unused: the loop only repeats the insert once per
                  # shifted-in column.
                  for col in range(min(ncols, abs(periods))):
                      # TODO(EA2D): doing this in a loop unnecessary with 2D EAs
                      # Define filler inside loop so we get a copy
                      # Shifting a column by the full frame length pushes
                      # every value out, leaving a filler of that column's
                      # kind.
                      filler = self.iloc[:, 0].shift(len(self))
                      result.insert(0, label, filler, allow_duplicates=True)
              else:
                  # Keep the trailing columns and pad on the right.
                  result = self.iloc[:, -periods:]
                  for col in range(min(ncols, abs(periods))):
                      # Define filler inside loop so we get a copy
                      filler = self.iloc[:, -1].shift(len(self))
                      result.insert(
                          len(result.columns), label, filler, allow_duplicates=True
                      )

              # The placeholder `label` used for the fillers is replaced by
              # the original column labels.
              result.columns = self.columns.copy()
              return result
          elif len(arrays) > 1 or (
              # If we only have one block and we know that we can't
              #  keep the same dtype (i.e. the _can_hold_element check)
              #  then we can go through the reindex_indexer path
              #  (and avoid casting logic in the Block method).
              not can_hold_element(arrays[0], fill_value)
          ):
              # GH#35488 we need to watch out for multi-block cases
              # We only get here with fill_value not-lib.no_default
              nper = abs(periods)
              nper = min(nper, ncols)
              # Build a positional indexer over the columns; the -1 entries
              # presumably mark slots to be filled with `fill_value` by
              # `reindex_indexer` (confirm).
              if periods > 0:
                  indexer = np.array(
                      [-1] * nper + list(range(ncols - periods)), dtype=np.intp
                  )
              else:
                  indexer = np.array(
                      list(range(nper, ncols)) + [-1] * nper, dtype=np.intp
                  )
              mgr = self._mgr.reindex_indexer(
                  self.columns,
                  indexer,
                  axis=0,
                  fill_value=fill_value,
                  allow_dups=True,
              )
              res_df = self._constructor_from_mgr(mgr, axes=mgr.axes)
              return res_df.__finalize__(self, method="shift")
          else:
              # Single array that can hold `fill_value`: shift the transpose
              # along its rows and transpose back.
              return self.T.shift(periods=periods, fill_value=fill_value).T

      return super().shift(
          periods=periods, freq=freq, axis=axis, fill_value=fill_value
      )
  5219. @overload
  5220. def set_index(
  5221. self,
  5222. keys,
  5223. *,
  5224. drop: bool = ...,
  5225. append: bool = ...,
  5226. inplace: Literal[False] = ...,
  5227. verify_integrity: bool = ...,
  5228. ) -> DataFrame:
  5229. ...
  5230. @overload
  5231. def set_index(
  5232. self,
  5233. keys,
  5234. *,
  5235. drop: bool = ...,
  5236. append: bool = ...,
  5237. inplace: Literal[True],
  5238. verify_integrity: bool = ...,
  5239. ) -> None:
  5240. ...
  5241. def set_index(
  5242. self,
  5243. keys,
  5244. *,
  5245. drop: bool = True,
  5246. append: bool = False,
  5247. inplace: bool = False,
  5248. verify_integrity: bool = False,
  5249. ) -> DataFrame | None:
  5250. """
  5251. Set the DataFrame index using existing columns.
  5252. Set the DataFrame index (row labels) using one or more existing
  5253. columns or arrays (of the correct length). The index can replace the
  5254. existing index or expand on it.
  5255. Parameters
  5256. ----------
  5257. keys : label or array-like or list of labels/arrays
  5258. This parameter can be either a single column key, a single array of
  5259. the same length as the calling DataFrame, or a list containing an
  5260. arbitrary combination of column keys and arrays. Here, "array"
  5261. encompasses :class:`Series`, :class:`Index`, ``np.ndarray``, and
  5262. instances of :class:`~collections.abc.Iterator`.
  5263. drop : bool, default True
  5264. Delete columns to be used as the new index.
  5265. append : bool, default False
  5266. Whether to append columns to existing index.
  5267. inplace : bool, default False
  5268. Whether to modify the DataFrame rather than creating a new one.
  5269. verify_integrity : bool, default False
  5270. Check the new index for duplicates. Otherwise defer the check until
  5271. necessary. Setting to False will improve the performance of this
  5272. method.
  5273. Returns
  5274. -------
  5275. DataFrame or None
  5276. Changed row labels or None if ``inplace=True``.
  5277. See Also
  5278. --------
  5279. DataFrame.reset_index : Opposite of set_index.
  5280. DataFrame.reindex : Change to new indices or expand indices.
  5281. DataFrame.reindex_like : Change to same indices as other DataFrame.
  5282. Examples
  5283. --------
  5284. >>> df = pd.DataFrame({'month': [1, 4, 7, 10],
  5285. ... 'year': [2012, 2014, 2013, 2014],
  5286. ... 'sale': [55, 40, 84, 31]})
  5287. >>> df
  5288. month year sale
  5289. 0 1 2012 55
  5290. 1 4 2014 40
  5291. 2 7 2013 84
  5292. 3 10 2014 31
  5293. Set the index to become the 'month' column:
  5294. >>> df.set_index('month')
  5295. year sale
  5296. month
  5297. 1 2012 55
  5298. 4 2014 40
  5299. 7 2013 84
  5300. 10 2014 31
  5301. Create a MultiIndex using columns 'year' and 'month':
  5302. >>> df.set_index(['year', 'month'])
  5303. sale
  5304. year month
  5305. 2012 1 55
  5306. 2014 4 40
  5307. 2013 7 84
  5308. 2014 10 31
  5309. Create a MultiIndex using an Index and a column:
  5310. >>> df.set_index([pd.Index([1, 2, 3, 4]), 'year'])
  5311. month sale
  5312. year
  5313. 1 2012 1 55
  5314. 2 2014 4 40
  5315. 3 2013 7 84
  5316. 4 2014 10 31
  5317. Create a MultiIndex using two Series:
  5318. >>> s = pd.Series([1, 2, 3, 4])
  5319. >>> df.set_index([s, s**2])
  5320. month year sale
  5321. 1 1 1 2012 55
  5322. 2 4 4 2014 40
  5323. 3 9 7 2013 84
  5324. 4 16 10 2014 31
  5325. """
  5326. inplace = validate_bool_kwarg(inplace, "inplace")
  5327. self._check_inplace_and_allows_duplicate_labels(inplace)
  5328. if not isinstance(keys, list):
  5329. keys = [keys]
  5330. err_msg = (
  5331. 'The parameter "keys" may be a column key, one-dimensional '
  5332. "array, or a list containing only valid column keys and "
  5333. "one-dimensional arrays."
  5334. )
  5335. missing: list[Hashable] = []
  5336. for col in keys:
  5337. if isinstance(col, (Index, Series, np.ndarray, list, abc.Iterator)):
  5338. # arrays are fine as long as they are one-dimensional
  5339. # iterators get converted to list below
  5340. if getattr(col, "ndim", 1) != 1:
  5341. raise ValueError(err_msg)
  5342. else:
  5343. # everything else gets tried as a key; see GH 24969
  5344. try:
  5345. found = col in self.columns
  5346. except TypeError as err:
  5347. raise TypeError(
  5348. f"{err_msg}. Received column of type {type(col)}"
  5349. ) from err
  5350. else:
  5351. if not found:
  5352. missing.append(col)
  5353. if missing:
  5354. raise KeyError(f"None of {missing} are in the columns")
  5355. if inplace:
  5356. frame = self
  5357. else:
  5358. # GH 49473 Use "lazy copy" with Copy-on-Write
  5359. frame = self.copy(deep=None)
  5360. arrays: list[Index] = []
  5361. names: list[Hashable] = []
  5362. if append:
  5363. names = list(self.index.names)
  5364. if isinstance(self.index, MultiIndex):
  5365. arrays.extend(
  5366. self.index._get_level_values(i) for i in range(self.index.nlevels)
  5367. )
  5368. else:
  5369. arrays.append(self.index)
  5370. to_remove: list[Hashable] = []
  5371. for col in keys:
  5372. if isinstance(col, MultiIndex):
  5373. arrays.extend(col._get_level_values(n) for n in range(col.nlevels))
  5374. names.extend(col.names)
  5375. elif isinstance(col, (Index, Series)):
  5376. # if Index then not MultiIndex (treated above)
  5377. # error: Argument 1 to "append" of "list" has incompatible type
  5378. # "Union[Index, Series]"; expected "Index"
  5379. arrays.append(col) # type: ignore[arg-type]
  5380. names.append(col.name)
  5381. elif isinstance(col, (list, np.ndarray)):
  5382. # error: Argument 1 to "append" of "list" has incompatible type
  5383. # "Union[List[Any], ndarray]"; expected "Index"
  5384. arrays.append(col) # type: ignore[arg-type]
  5385. names.append(None)
  5386. elif isinstance(col, abc.Iterator):
  5387. # error: Argument 1 to "append" of "list" has incompatible type
  5388. # "List[Any]"; expected "Index"
  5389. arrays.append(list(col)) # type: ignore[arg-type]
  5390. names.append(None)
  5391. # from here, col can only be a column label
  5392. else:
  5393. arrays.append(frame[col])
  5394. names.append(col)
  5395. if drop:
  5396. to_remove.append(col)
  5397. if len(arrays[-1]) != len(self):
  5398. # check newest element against length of calling frame, since
  5399. # ensure_index_from_sequences would not raise for append=False.
  5400. raise ValueError(
  5401. f"Length mismatch: Expected {len(self)} rows, "
  5402. f"received array of length {len(arrays[-1])}"
  5403. )
  5404. index = ensure_index_from_sequences(arrays, names)
  5405. if verify_integrity and not index.is_unique:
  5406. duplicates = index[index.duplicated()].unique()
  5407. raise ValueError(f"Index has duplicate keys: {duplicates}")
  5408. # use set to handle duplicate column names gracefully in case of drop
  5409. for c in set(to_remove):
  5410. del frame[c]
  5411. # clear up memory usage
  5412. index._cleanup()
  5413. frame.index = index
  5414. if not inplace:
  5415. return frame
  5416. return None
  # Typing stubs for ``reset_index``: the declared return type is selected by
  # the ``inplace`` argument.
  @overload
  def reset_index(
      self,
      level: IndexLabel = ...,
      *,
      drop: bool = ...,
      inplace: Literal[False] = ...,
      col_level: Hashable = ...,
      col_fill: Hashable = ...,
      allow_duplicates: bool | lib.NoDefault = ...,
      names: Hashable | Sequence[Hashable] | None = None,
  ) -> DataFrame:
      # ``inplace`` omitted or literally False -> DataFrame
      ...

  @overload
  def reset_index(
      self,
      level: IndexLabel = ...,
      *,
      drop: bool = ...,
      inplace: Literal[True],
      col_level: Hashable = ...,
      col_fill: Hashable = ...,
      allow_duplicates: bool | lib.NoDefault = ...,
      names: Hashable | Sequence[Hashable] | None = None,
  ) -> None:
      # ``inplace`` literally True -> None
      ...

  @overload
  def reset_index(
      self,
      level: IndexLabel = ...,
      *,
      drop: bool = ...,
      inplace: bool = ...,
      col_level: Hashable = ...,
      col_fill: Hashable = ...,
      allow_duplicates: bool | lib.NoDefault = ...,
      names: Hashable | Sequence[Hashable] | None = None,
  ) -> DataFrame | None:
      # ``inplace`` only known to be a bool -> either outcome
      ...
  def reset_index(
      self,
      level: IndexLabel | None = None,
      *,
      drop: bool = False,
      inplace: bool = False,
      col_level: Hashable = 0,
      col_fill: Hashable = "",
      allow_duplicates: bool | lib.NoDefault = lib.no_default,
      names: Hashable | Sequence[Hashable] | None = None,
  ) -> DataFrame | None:
      """
      Reset the index, or a level of it.

      Reset the index of the DataFrame, and use the default one instead.
      If the DataFrame has a MultiIndex, this method can remove one or more
      levels.

      Parameters
      ----------
      level : int, str, tuple, or list, default None
          Only remove the given levels from the index. Removes all levels by
          default.
      drop : bool, default False
          Do not try to insert index into dataframe columns. This resets
          the index to the default integer index.
      inplace : bool, default False
          Whether to modify the DataFrame rather than creating a new one.
      col_level : int or str, default 0
          If the columns have multiple levels, determines which level the
          labels are inserted into. By default it is inserted into the first
          level.
      col_fill : object, default ''
          If the columns have multiple levels, determines how the other
          levels are named. If None then the index name is repeated; when
          several index levels are inserted, each one repeats its own name.
      allow_duplicates : bool, optional, default lib.no_default
          Allow duplicate column labels to be created.

          .. versionadded:: 1.5.0

      names : int, str or 1-dimensional list, default None
          Using the given string, rename the DataFrame column which contains the
          index data. If the DataFrame has a MultiIndex, this has to be a list or
          tuple with length equal to the number of levels.

          .. versionadded:: 1.5.0

      Returns
      -------
      DataFrame or None
          DataFrame with the new index or None if ``inplace=True``.

      Raises
      ------
      ValueError
          If ``col_fill`` is None, the columns are a MultiIndex and an index
          name is a tuple whose length is neither 1 nor the number of
          column levels.

      See Also
      --------
      DataFrame.set_index : Opposite of reset_index.
      DataFrame.reindex : Change to new indices or expand indices.
      DataFrame.reindex_like : Change to same indices as other DataFrame.

      Examples
      --------
      >>> df = pd.DataFrame([('bird', 389.0),
      ...                    ('bird', 24.0),
      ...                    ('mammal', 80.5),
      ...                    ('mammal', np.nan)],
      ...                   index=['falcon', 'parrot', 'lion', 'monkey'],
      ...                   columns=('class', 'max_speed'))
      >>> df
               class  max_speed
      falcon    bird      389.0
      parrot    bird       24.0
      lion    mammal       80.5
      monkey  mammal        NaN

      When we reset the index, the old index is added as a column, and a
      new sequential index is used:

      >>> df.reset_index()
          index   class  max_speed
      0  falcon    bird      389.0
      1  parrot    bird       24.0
      2    lion  mammal       80.5
      3  monkey  mammal        NaN

      We can use the `drop` parameter to avoid the old index being added as
      a column:

      >>> df.reset_index(drop=True)
          class  max_speed
      0    bird      389.0
      1    bird       24.0
      2  mammal       80.5
      3  mammal        NaN

      You can also use `reset_index` with `MultiIndex`.

      >>> index = pd.MultiIndex.from_tuples([('bird', 'falcon'),
      ...                                    ('bird', 'parrot'),
      ...                                    ('mammal', 'lion'),
      ...                                    ('mammal', 'monkey')],
      ...                                   names=['class', 'name'])
      >>> columns = pd.MultiIndex.from_tuples([('speed', 'max'),
      ...                                      ('species', 'type')])
      >>> df = pd.DataFrame([(389.0, 'fly'),
      ...                    (24.0, 'fly'),
      ...                    (80.5, 'run'),
      ...                    (np.nan, 'jump')],
      ...                   index=index,
      ...                   columns=columns)
      >>> df
                     speed species
                       max    type
      class  name
      bird   falcon  389.0     fly
             parrot   24.0     fly
      mammal lion     80.5     run
             monkey    NaN    jump

      Using the `names` parameter, choose a name for the index column:

      >>> df.reset_index(names=['classes', 'names'])
        classes   names  speed species
                           max    type
      0    bird  falcon  389.0     fly
      1    bird  parrot   24.0     fly
      2  mammal    lion   80.5     run
      3  mammal  monkey    NaN    jump

      If the index has multiple levels, we can reset a subset of them:

      >>> df.reset_index(level='class')
               class  speed species
                        max    type
      name
      falcon    bird  389.0     fly
      parrot    bird   24.0     fly
      lion    mammal   80.5     run
      monkey  mammal    NaN    jump

      If we are not dropping the index, by default, it is placed in the top
      level. We can place it in another level:

      >>> df.reset_index(level='class', col_level=1)
                      speed species
               class    max    type
      name
      falcon    bird  389.0     fly
      parrot    bird   24.0     fly
      lion    mammal   80.5     run
      monkey  mammal    NaN    jump

      When the index is inserted under another level, we can specify under
      which one with the parameter `col_fill`:

      >>> df.reset_index(level='class', col_level=1, col_fill='species')
                    species  speed species
                      class    max    type
      name
      falcon           bird  389.0     fly
      parrot           bird   24.0     fly
      lion           mammal   80.5     run
      monkey         mammal    NaN    jump

      If we specify a nonexistent level for `col_fill`, it is created:

      >>> df.reset_index(level='class', col_level=1, col_fill='genus')
                      genus  speed species
                      class    max    type
      name
      falcon           bird  389.0     fly
      parrot           bird   24.0     fly
      lion           mammal   80.5     run
      monkey         mammal    NaN    jump
      """
      inplace = validate_bool_kwarg(inplace, "inplace")
      self._check_inplace_and_allows_duplicate_labels(inplace)
      if inplace:
          new_obj = self
      else:
          new_obj = self.copy(deep=None)
      if allow_duplicates is not lib.no_default:
          allow_duplicates = validate_bool_kwarg(allow_duplicates, "allow_duplicates")

      # Default outcome: a fresh 0..n-1 index. When only some levels are
      # reset, the remaining levels stay as the index instead.
      new_index = default_index(len(new_obj))
      if level is not None:
          if not isinstance(level, (tuple, list)):
              level = [level]
          level = [self.index._get_level_number(lev) for lev in level]
          if len(level) < self.index.nlevels:
              new_index = self.index.droplevel(level)

      if not drop:
          to_insert: Iterable[tuple[Any, Any | None]]

          default = "index" if "index" not in self else "level_0"
          names = self.index._get_default_index_names(names, default)

          if isinstance(self.index, MultiIndex):
              to_insert = zip(self.index.levels, self.index.codes)
          else:
              to_insert = ((self.index, None),)

          multi_col = isinstance(self.columns, MultiIndex)
          # Walk the levels back to front: each one is inserted at position 0,
          # so the columns end up in the original level order.
          for i, (lev, lab) in reversed(list(enumerate(to_insert))):
              if level is not None and i not in level:
                  continue
              name = names[i]
              if multi_col:
                  col_name = list(name) if isinstance(name, tuple) else [name]
                  if col_fill is None:
                      if len(col_name) not in (1, self.columns.nlevels):
                          raise ValueError(
                              "col_fill=None is incompatible "
                              f"with incomplete column name {name}"
                          )
                      # Keep the filler local to this level. Assigning it back
                      # to ``col_fill`` would make every later level reuse the
                      # name of the first level processed.
                      fill = col_name[0]
                  else:
                      fill = col_fill

                  lev_num = self.columns._get_level_number(col_level)
                  name_lst = [fill] * lev_num + col_name
                  missing = self.columns.nlevels - len(name_lst)
                  name_lst += [fill] * missing
                  name = tuple(name_lst)

              # to ndarray and maybe infer different dtype
              level_values = lev._values
              if level_values.dtype == np.object_:
                  level_values = lib.maybe_convert_objects(level_values)

              if lab is not None:
                  # if we have the codes, extract the values with a mask
                  level_values = algorithms.take(
                      level_values, lab, allow_fill=True, fill_value=lev._na_value
                  )

              new_obj.insert(
                  0,
                  name,
                  level_values,
                  allow_duplicates=allow_duplicates,
              )

      new_obj.index = new_index
      if not inplace:
          return new_obj

      return None
  5666. # ----------------------------------------------------------------------
  5667. # Reindex-based selection methods
  @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"])
  def isna(self) -> DataFrame:
      # No docstring here on purpose: it is expected to come from the ``@doc``
      # template built on ``NDFrame.isna`` (confirm against ``doc``).
      # The bare ``isna`` passed as ``func`` resolves to the module-level
      # name, not to this method.
      mask_mgr = self._mgr.isna(func=isna)
      wrapped = self._constructor_from_mgr(mask_mgr, axes=mask_mgr.axes)
      return wrapped.__finalize__(self, method="isna")
  @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"])
  def isnull(self) -> DataFrame:
      """
      DataFrame.isnull is an alias for DataFrame.isna.
      """
      # Pure alias: hand the call straight to ``isna``.
      missing_mask = self.isna()
      return missing_mask
  @doc(NDFrame.notna, klass=_shared_doc_kwargs["klass"])
  def notna(self) -> DataFrame:
      # No docstring here on purpose: it is expected to come from the ``@doc``
      # template built on ``NDFrame.notna`` (confirm against ``doc``).
      # The result is the element-wise inverse of the ``isna`` mask.
      missing_mask = self.isna()
      return ~missing_mask
  @doc(NDFrame.notna, klass=_shared_doc_kwargs["klass"])
  def notnull(self) -> DataFrame:
      """
      DataFrame.notnull is an alias for DataFrame.notna.
      """
      # Delegate to ``notna`` (as ``isnull`` delegates to ``isna``) so the
      # alias can never drift from the method it aliases, including when a
      # subclass overrides ``notna``.
      return self.notna()
  # Typing stubs for ``dropna``: the declared return type is selected by the
  # ``inplace`` argument.
  @overload
  def dropna(
      self,
      *,
      axis: Axis = ...,
      how: AnyAll | lib.NoDefault = ...,
      thresh: int | lib.NoDefault = ...,
      subset: IndexLabel = ...,
      inplace: Literal[False] = ...,
      ignore_index: bool = ...,
  ) -> DataFrame:
      # ``inplace`` omitted or literally False -> DataFrame
      ...

  @overload
  def dropna(
      self,
      *,
      axis: Axis = ...,
      how: AnyAll | lib.NoDefault = ...,
      thresh: int | lib.NoDefault = ...,
      subset: IndexLabel = ...,
      inplace: Literal[True],
      ignore_index: bool = ...,
  ) -> None:
      # ``inplace`` literally True -> None
      ...
  def dropna(
      self,
      *,
      axis: Axis = 0,
      how: AnyAll | lib.NoDefault = lib.no_default,
      thresh: int | lib.NoDefault = lib.no_default,
      subset: IndexLabel | None = None,
      inplace: bool = False,
      ignore_index: bool = False,
  ) -> DataFrame | None:
      """
      Remove missing values.

      See the :ref:`User Guide <missing_data>` for more on which values are
      considered missing, and how to work with missing data.

      Parameters
      ----------
      axis : {0 or 'index', 1 or 'columns'}, default 0
          Determine if rows or columns which contain missing values are
          removed.

          * 0, or 'index' : Drop rows which contain missing values.
          * 1, or 'columns' : Drop columns which contain missing values.

          Only a single axis is allowed.

      how : {'any', 'all'}, default 'any'
          Determine if row or column is removed from DataFrame, when we have
          at least one NA or all NA.

          * 'any' : If any NA values are present, drop that row or column.
          * 'all' : If all values are NA, drop that row or column.

      thresh : int, optional
          Require that many non-NA values. Cannot be combined with how.
      subset : column label or sequence of labels, optional
          Labels along other axis to consider, e.g. if you are dropping rows
          these would be a list of columns to include.
      inplace : bool, default False
          Whether to modify the DataFrame rather than creating a new one.
      ignore_index : bool, default ``False``
          If ``True``, the resulting axis will be labeled 0, 1, …, n - 1.

          .. versionadded:: 2.0.0

      Returns
      -------
      DataFrame or None
          DataFrame with NA entries dropped from it or None if ``inplace=True``.

      Raises
      ------
      TypeError
          If both ``how`` and ``thresh`` are given, or if ``axis`` is a
          tuple or list.
      KeyError
          If a label in ``subset`` is not found on the other axis.
      ValueError
          If ``how`` is neither 'any' nor 'all'.

      See Also
      --------
      DataFrame.isna: Indicate missing values.
      DataFrame.notna : Indicate existing (non-missing) values.
      DataFrame.fillna : Replace missing values.
      Series.dropna : Drop missing values.
      Index.dropna : Drop missing indices.

      Examples
      --------
      >>> df = pd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'],
      ...                    "toy": [np.nan, 'Batmobile', 'Bullwhip'],
      ...                    "born": [pd.NaT, pd.Timestamp("1940-04-25"),
      ...                             pd.NaT]})
      >>> df
             name        toy       born
      0    Alfred        NaN        NaT
      1    Batman  Batmobile 1940-04-25
      2  Catwoman   Bullwhip        NaT

      Drop the rows where at least one element is missing.

      >>> df.dropna()
           name        toy       born
      1  Batman  Batmobile 1940-04-25

      Drop the columns where at least one element is missing.

      >>> df.dropna(axis='columns')
             name
      0    Alfred
      1    Batman
      2  Catwoman

      Drop the rows where all elements are missing.

      >>> df.dropna(how='all')
             name        toy       born
      0    Alfred        NaN        NaT
      1    Batman  Batmobile 1940-04-25
      2  Catwoman   Bullwhip        NaT

      Keep only the rows with at least 2 non-NA values.

      >>> df.dropna(thresh=2)
             name        toy       born
      1    Batman  Batmobile 1940-04-25
      2  Catwoman   Bullwhip        NaT

      Define in which columns to look for missing values.

      >>> df.dropna(subset=['name', 'toy'])
             name        toy       born
      1    Batman  Batmobile 1940-04-25
      2  Catwoman   Bullwhip        NaT
      """
      how_given = how is not lib.no_default
      thresh_given = thresh is not lib.no_default
      if how_given and thresh_given:
          raise TypeError(
              "You cannot set both the how and thresh arguments at the same time."
          )
      if not how_given:
          how = "any"

      inplace = validate_bool_kwarg(inplace, "inplace")
      if isinstance(axis, (tuple, list)):
          # GH20987
          raise TypeError("supplying multiple axes to axis is no longer supported.")

      axis = self._get_axis_number(axis)
      # missing values are counted along the axis that is *not* being dropped
      other_axis = 1 - axis

      candidates = self
      if subset is not None:
          # subset needs to be list
          if not is_list_like(subset):
              subset = [subset]
          positions = self._get_axis(other_axis).get_indexer_for(subset)
          not_found = positions == -1
          if not_found.any():
              raise KeyError(np.array(subset)[not_found].tolist())
          candidates = self.take(positions, axis=other_axis)

      if thresh_given:
          keep = candidates.count(axis=other_axis) >= thresh
      elif how == "any":
          # faster equivalent to 'agg_obj.count(agg_axis) == self.shape[agg_axis]'
          keep = notna(candidates).all(axis=other_axis, bool_only=False)
      elif how == "all":
          # faster equivalent to 'agg_obj.count(agg_axis) > 0'
          keep = notna(candidates).any(axis=other_axis, bool_only=False)
      else:
          raise ValueError(f"invalid how option: {how}")

      # nothing to drop -> copy the frame rather than index it with the mask
      result = self.copy(deep=None) if np.all(keep) else self.loc(axis=axis)[keep]

      if ignore_index:
          result.index = default_index(len(result))

      if inplace:
          self._update_inplace(result)
          return None
      return result
  # Typing stubs for ``drop_duplicates``: the declared return type is selected
  # by the ``inplace`` argument.
  @overload
  def drop_duplicates(
      self,
      subset: Hashable | Sequence[Hashable] | None = ...,
      *,
      keep: DropKeep = ...,
      inplace: Literal[True],
      ignore_index: bool = ...,
  ) -> None:
      # ``inplace`` literally True -> None
      ...

  @overload
  def drop_duplicates(
      self,
      subset: Hashable | Sequence[Hashable] | None = ...,
      *,
      keep: DropKeep = ...,
      inplace: Literal[False] = ...,
      ignore_index: bool = ...,
  ) -> DataFrame:
      # ``inplace`` omitted or literally False -> DataFrame
      ...

  @overload
  def drop_duplicates(
      self,
      subset: Hashable | Sequence[Hashable] | None = ...,
      *,
      keep: DropKeep = ...,
      inplace: bool = ...,
      ignore_index: bool = ...,
  ) -> DataFrame | None:
      # ``inplace`` only known to be a bool -> either outcome
      ...
  def drop_duplicates(
      self,
      subset: Hashable | Sequence[Hashable] | None = None,
      *,
      keep: DropKeep = "first",
      inplace: bool = False,
      ignore_index: bool = False,
  ) -> DataFrame | None:
      """
      Return DataFrame with duplicate rows removed.

      Considering certain columns is optional. Indexes, including time indexes
      are ignored.

      Parameters
      ----------
      subset : column label or sequence of labels, optional
          Only consider certain columns for identifying duplicates, by
          default use all of the columns.
      keep : {'first', 'last', ``False``}, default 'first'
          Determines which duplicates (if any) to keep.

          - 'first' : Drop duplicates except for the first occurrence.
          - 'last' : Drop duplicates except for the last occurrence.
          - ``False`` : Drop all duplicates.

      inplace : bool, default ``False``
          Whether to modify the DataFrame rather than creating a new one.
      ignore_index : bool, default ``False``
          If ``True``, the resulting axis will be labeled 0, 1, …, n - 1.

      Returns
      -------
      DataFrame or None
          DataFrame with duplicates removed or None if ``inplace=True``.

      See Also
      --------
      DataFrame.value_counts: Count unique combinations of columns.

      Examples
      --------
      Consider dataset containing ramen rating.

      >>> df = pd.DataFrame({
      ...     'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'],
      ...     'style': ['cup', 'cup', 'cup', 'pack', 'pack'],
      ...     'rating': [4, 4, 3.5, 15, 5]
      ... })
      >>> df
          brand style  rating
      0  Yum Yum   cup     4.0
      1  Yum Yum   cup     4.0
      2  Indomie   cup     3.5
      3  Indomie  pack    15.0
      4  Indomie  pack     5.0

      By default, it removes duplicate rows based on all columns.

      >>> df.drop_duplicates()
          brand style  rating
      0  Yum Yum   cup     4.0
      2  Indomie   cup     3.5
      3  Indomie  pack    15.0
      4  Indomie  pack     5.0

      To remove duplicates on specific column(s), use ``subset``.

      >>> df.drop_duplicates(subset=['brand'])
          brand style  rating
      0  Yum Yum   cup     4.0
      2  Indomie   cup     3.5

      To remove duplicates and keep last occurrences, use ``keep``.

      >>> df.drop_duplicates(subset=['brand', 'style'], keep='last')
          brand style  rating
      1  Yum Yum   cup     4.0
      2  Indomie   cup     3.5
      4  Indomie  pack     5.0
      """
      # Validate before any early exit so that bad arguments are rejected
      # regardless of whether the frame happens to be empty.
      inplace = validate_bool_kwarg(inplace, "inplace")
      ignore_index = validate_bool_kwarg(ignore_index, "ignore_index")

      if self.empty:
          # Nothing can be dropped. Still honour the ``inplace`` contract:
          # None when operating in place, a copy otherwise.
          return None if inplace else self.copy(deep=None)

      # ``~`` is the boolean inversion of the duplicate mask
      result = self[~self.duplicated(subset, keep=keep)]
      if ignore_index:
          result.index = default_index(len(result))

      if inplace:
          self._update_inplace(result)
          return None
      return result
  def duplicated(
      self,
      subset: Hashable | Sequence[Hashable] | None = None,
      keep: DropKeep = "first",
  ) -> Series:
      """
      Return boolean Series denoting duplicate rows.

      Considering certain columns is optional.

      Parameters
      ----------
      subset : column label or sequence of labels, optional
          Only consider certain columns for identifying duplicates, by
          default use all of the columns.
      keep : {'first', 'last', False}, default 'first'
          Determines which duplicates (if any) to mark.

          - ``first`` : Mark duplicates as ``True`` except for the first occurrence.
          - ``last`` : Mark duplicates as ``True`` except for the last occurrence.
          - False : Mark all duplicates as ``True``.

      Returns
      -------
      Series
          Boolean Series with one entry per row, ``True`` where the row is
          marked as a duplicate.

      Raises
      ------
      KeyError
          If ``subset`` names a label that is not among the columns.

      See Also
      --------
      Index.duplicated : Equivalent method on index.
      Series.duplicated : Equivalent method on Series.
      Series.drop_duplicates : Remove duplicate values from Series.
      DataFrame.drop_duplicates : Remove duplicate values from DataFrame.

      Examples
      --------
      Consider dataset containing ramen rating.

      >>> df = pd.DataFrame({
      ...     'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'],
      ...     'style': ['cup', 'cup', 'cup', 'pack', 'pack'],
      ...     'rating': [4, 4, 3.5, 15, 5]
      ... })
      >>> df
          brand style  rating
      0  Yum Yum   cup     4.0
      1  Yum Yum   cup     4.0
      2  Indomie   cup     3.5
      3  Indomie  pack    15.0
      4  Indomie  pack     5.0

      By default, for each set of duplicated values, the first occurrence
      is set on False and all others on True.

      >>> df.duplicated()
      0    False
      1     True
      2    False
      3    False
      4    False
      dtype: bool

      By using 'last', the last occurrence of each set of duplicated values
      is set on False and all others on True.

      >>> df.duplicated(keep='last')
      0     True
      1    False
      2    False
      3    False
      4    False
      dtype: bool

      By setting ``keep`` on False, all duplicates are True.

      >>> df.duplicated(keep=False)
      0     True
      1     True
      2    False
      3    False
      4    False
      dtype: bool

      To find duplicates on specific column(s), use ``subset``.

      >>> df.duplicated(subset=['brand'])
      0    False
      1     True
      2    False
      3     True
      4     True
      dtype: bool
      """
      if self.empty:
          return self._constructor_sliced(dtype=bool)

      def factorize_column(values) -> tuple[np.ndarray, int]:
          # integer codes for one column plus the number of distinct values
          codes, uniques = algorithms.factorize(values, size_hint=len(self))
          return codes.astype("i8", copy=False), len(uniques)

      if subset is None:
          # https://github.com/pandas-dev/pandas/issues/28770
          # Incompatible types in assignment (expression has type "Index", variable
          # has type "Sequence[Any]")
          subset = self.columns  # type: ignore[assignment]
      else:
          # A non-iterable, a string, or a tuple that is itself a column
          # label denotes a single column rather than a collection of labels.
          is_single_label = (
              not np.iterable(subset)
              or isinstance(subset, str)
              or (isinstance(subset, tuple) and subset in self.columns)
          )
          if is_single_label:
              subset = (subset,)

      # needed for mypy since can't narrow types using np.iterable
      subset = cast(Sequence, subset)

      # Verify all columns in subset exist in the queried dataframe
      # Otherwise, raise a KeyError, same as if you try to __getitem__ with a
      # key that doesn't exist.
      unknown = set(subset) - set(self.columns)
      if unknown:
          raise KeyError(Index(unknown))

      if len(subset) == 1 and self.columns.is_unique:
          # GH#45236 This is faster than get_group_index below
          flags = self[subset[0]].duplicated(keep)
          flags.name = None
      else:
          selected = (col.values for label, col in self.items() if label in subset)
          factorized = [factorize_column(values) for values in selected]
          labels, shape = (list(part) for part in zip(*factorized))

          ids = get_group_index(labels, tuple(shape), sort=False, xnull=False)
          # the bare ``duplicated`` here is the module-level name, not this method
          flags = self._constructor_sliced(duplicated(ids, keep), index=self.index)
      return flags.__finalize__(self, method="duplicated")
  6064. # ----------------------------------------------------------------------
  6065. # Sorting
  6066. # error: Signature of "sort_values" incompatible with supertype "NDFrame"
  # Typing stubs for ``sort_values``: the declared return type is selected by
  # the ``inplace`` argument.
  @overload  # type: ignore[override]
  def sort_values(
      self,
      by: IndexLabel,
      *,
      axis: Axis = ...,
      ascending=...,
      inplace: Literal[False] = ...,
      kind: SortKind = ...,
      na_position: NaPosition = ...,
      ignore_index: bool = ...,
      key: ValueKeyFunc = ...,
  ) -> DataFrame:
      # ``inplace`` omitted or literally False -> DataFrame
      ...

  @overload
  def sort_values(
      self,
      by: IndexLabel,
      *,
      axis: Axis = ...,
      ascending=...,
      inplace: Literal[True],
      kind: SortKind = ...,
      na_position: str = ...,
      ignore_index: bool = ...,
      key: ValueKeyFunc = ...,
  ) -> None:
      # ``inplace`` literally True -> None
      ...
  6095. def sort_values(
  6096. self,
  6097. by: IndexLabel,
  6098. *,
  6099. axis: Axis = 0,
  6100. ascending: bool | list[bool] | tuple[bool, ...] = True,
  6101. inplace: bool = False,
  6102. kind: SortKind = "quicksort",
  6103. na_position: str = "last",
  6104. ignore_index: bool = False,
  6105. key: ValueKeyFunc | None = None,
  6106. ) -> DataFrame | None:
  6107. """
  6108. Sort by the values along either axis.
  6109. Parameters
  6110. ----------
  6111. by : str or list of str
  6112. Name or list of names to sort by.
  6113. - if `axis` is 0 or `'index'` then `by` may contain index
  6114. levels and/or column labels.
  6115. - if `axis` is 1 or `'columns'` then `by` may contain column
  6116. levels and/or index labels.
  6117. axis : "{0 or 'index', 1 or 'columns'}", default 0
  6118. Axis to be sorted.
  6119. ascending : bool or list of bool, default True
  6120. Sort ascending vs. descending. Specify list for multiple sort
  6121. orders. If this is a list of bools, must match the length of
  6122. the by.
  6123. inplace : bool, default False
  6124. If True, perform operation in-place.
  6125. kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'quicksort'
  6126. Choice of sorting algorithm. See also :func:`numpy.sort` for more
  6127. information. `mergesort` and `stable` are the only stable algorithms. For
  6128. DataFrames, this option is only applied when sorting on a single
  6129. column or label.
  6130. na_position : {'first', 'last'}, default 'last'
  6131. Puts NaNs at the beginning if `first`; `last` puts NaNs at the
  6132. end.
  6133. ignore_index : bool, default False
  6134. If True, the resulting axis will be labeled 0, 1, …, n - 1.
  6135. key : callable, optional
  6136. Apply the key function to the values
  6137. before sorting. This is similar to the `key` argument in the
  6138. builtin :meth:`sorted` function, with the notable difference that
  6139. this `key` function should be *vectorized*. It should expect a
  6140. ``Series`` and return a Series with the same shape as the input.
  6141. It will be applied to each column in `by` independently.
  6142. Returns
  6143. -------
  6144. DataFrame or None
  6145. DataFrame with sorted values or None if ``inplace=True``.
  6146. See Also
  6147. --------
  6148. DataFrame.sort_index : Sort a DataFrame by the index.
  6149. Series.sort_values : Similar method for a Series.
  6150. Examples
  6151. --------
  6152. >>> df = pd.DataFrame({
  6153. ... 'col1': ['A', 'A', 'B', np.nan, 'D', 'C'],
  6154. ... 'col2': [2, 1, 9, 8, 7, 4],
  6155. ... 'col3': [0, 1, 9, 4, 2, 3],
  6156. ... 'col4': ['a', 'B', 'c', 'D', 'e', 'F']
  6157. ... })
  6158. >>> df
  6159. col1 col2 col3 col4
  6160. 0 A 2 0 a
  6161. 1 A 1 1 B
  6162. 2 B 9 9 c
  6163. 3 NaN 8 4 D
  6164. 4 D 7 2 e
  6165. 5 C 4 3 F
  6166. Sort by col1
  6167. >>> df.sort_values(by=['col1'])
  6168. col1 col2 col3 col4
  6169. 0 A 2 0 a
  6170. 1 A 1 1 B
  6171. 2 B 9 9 c
  6172. 5 C 4 3 F
  6173. 4 D 7 2 e
  6174. 3 NaN 8 4 D
  6175. Sort by multiple columns
  6176. >>> df.sort_values(by=['col1', 'col2'])
  6177. col1 col2 col3 col4
  6178. 1 A 1 1 B
  6179. 0 A 2 0 a
  6180. 2 B 9 9 c
  6181. 5 C 4 3 F
  6182. 4 D 7 2 e
  6183. 3 NaN 8 4 D
  6184. Sort Descending
  6185. >>> df.sort_values(by='col1', ascending=False)
  6186. col1 col2 col3 col4
  6187. 4 D 7 2 e
  6188. 5 C 4 3 F
  6189. 2 B 9 9 c
  6190. 0 A 2 0 a
  6191. 1 A 1 1 B
  6192. 3 NaN 8 4 D
  6193. Putting NAs first
  6194. >>> df.sort_values(by='col1', ascending=False, na_position='first')
  6195. col1 col2 col3 col4
  6196. 3 NaN 8 4 D
  6197. 4 D 7 2 e
  6198. 5 C 4 3 F
  6199. 2 B 9 9 c
  6200. 0 A 2 0 a
  6201. 1 A 1 1 B
  6202. Sorting with a key function
  6203. >>> df.sort_values(by='col4', key=lambda col: col.str.lower())
  6204. col1 col2 col3 col4
  6205. 0 A 2 0 a
  6206. 1 A 1 1 B
  6207. 2 B 9 9 c
  6208. 3 NaN 8 4 D
  6209. 4 D 7 2 e
  6210. 5 C 4 3 F
  6211. Natural sort with the key argument,
  6212. using the `natsort <https://github.com/SethMMorton/natsort>` package.
  6213. >>> df = pd.DataFrame({
  6214. ... "time": ['0hr', '128hr', '72hr', '48hr', '96hr'],
  6215. ... "value": [10, 20, 30, 40, 50]
  6216. ... })
  6217. >>> df
  6218. time value
  6219. 0 0hr 10
  6220. 1 128hr 20
  6221. 2 72hr 30
  6222. 3 48hr 40
  6223. 4 96hr 50
  6224. >>> from natsort import index_natsorted
  6225. >>> df.sort_values(
  6226. ... by="time",
  6227. ... key=lambda x: np.argsort(index_natsorted(df["time"]))
  6228. ... )
  6229. time value
  6230. 0 0hr 10
  6231. 3 48hr 40
  6232. 2 72hr 30
  6233. 4 96hr 50
  6234. 1 128hr 20
  6235. """
  6236. inplace = validate_bool_kwarg(inplace, "inplace")
  6237. axis = self._get_axis_number(axis)
  6238. ascending = validate_ascending(ascending)
  6239. if not isinstance(by, list):
  6240. by = [by]
  6241. # error: Argument 1 to "len" has incompatible type "Union[bool, List[bool]]";
  6242. # expected "Sized"
  6243. if is_sequence(ascending) and (
  6244. len(by) != len(ascending) # type: ignore[arg-type]
  6245. ):
  6246. # error: Argument 1 to "len" has incompatible type "Union[bool,
  6247. # List[bool]]"; expected "Sized"
  6248. raise ValueError(
  6249. f"Length of ascending ({len(ascending)})" # type: ignore[arg-type]
  6250. f" != length of by ({len(by)})"
  6251. )
  6252. if len(by) > 1:
  6253. keys = [self._get_label_or_level_values(x, axis=axis) for x in by]
  6254. # need to rewrap columns in Series to apply key function
  6255. if key is not None:
  6256. # error: List comprehension has incompatible type List[Series];
  6257. # expected List[ndarray]
  6258. keys = [
  6259. Series(k, name=name) # type: ignore[misc]
  6260. for (k, name) in zip(keys, by)
  6261. ]
  6262. indexer = lexsort_indexer(
  6263. keys, orders=ascending, na_position=na_position, key=key
  6264. )
  6265. elif len(by):
  6266. # len(by) == 1
  6267. k = self._get_label_or_level_values(by[0], axis=axis)
  6268. # need to rewrap column in Series to apply key function
  6269. if key is not None:
  6270. # error: Incompatible types in assignment (expression has type
  6271. # "Series", variable has type "ndarray")
  6272. k = Series(k, name=by[0]) # type: ignore[assignment]
  6273. if isinstance(ascending, (tuple, list)):
  6274. ascending = ascending[0]
  6275. indexer = nargsort(
  6276. k, kind=kind, ascending=ascending, na_position=na_position, key=key
  6277. )
  6278. else:
  6279. if inplace:
  6280. return self._update_inplace(self)
  6281. else:
  6282. return self.copy(deep=None)
  6283. if is_range_indexer(indexer, len(indexer)):
  6284. result = self.copy(deep=(not inplace and not using_copy_on_write()))
  6285. if ignore_index:
  6286. result.index = default_index(len(result))
  6287. if inplace:
  6288. return self._update_inplace(result)
  6289. else:
  6290. return result
  6291. new_data = self._mgr.take(
  6292. indexer, axis=self._get_block_manager_axis(axis), verify=False
  6293. )
  6294. if ignore_index:
  6295. new_data.set_axis(
  6296. self._get_block_manager_axis(axis), default_index(len(indexer))
  6297. )
  6298. result = self._constructor_from_mgr(new_data, axes=new_data.axes)
  6299. if inplace:
  6300. return self._update_inplace(result)
  6301. else:
  6302. return result.__finalize__(self, method="sort_values")
  @overload
  def sort_index(
      self,
      *,
      axis: Axis = ...,
      level: IndexLabel = ...,
      ascending: bool | Sequence[bool] = ...,
      inplace: Literal[True],
      kind: SortKind = ...,
      na_position: NaPosition = ...,
      sort_remaining: bool = ...,
      ignore_index: bool = ...,
      key: IndexKeyFunc = ...,
  ) -> None:
      ...

  @overload
  def sort_index(
      self,
      *,
      axis: Axis = ...,
      level: IndexLabel = ...,
      ascending: bool | Sequence[bool] = ...,
      inplace: Literal[False] = ...,
      kind: SortKind = ...,
      na_position: NaPosition = ...,
      sort_remaining: bool = ...,
      ignore_index: bool = ...,
      key: IndexKeyFunc = ...,
  ) -> DataFrame:
      ...

  @overload
  def sort_index(
      self,
      *,
      axis: Axis = ...,
      level: IndexLabel = ...,
      ascending: bool | Sequence[bool] = ...,
      inplace: bool = ...,
      kind: SortKind = ...,
      na_position: NaPosition = ...,
      sort_remaining: bool = ...,
      ignore_index: bool = ...,
      key: IndexKeyFunc = ...,
  ) -> DataFrame | None:
      ...

  def sort_index(
      self,
      *,
      axis: Axis = 0,
      level: IndexLabel | None = None,
      ascending: bool | Sequence[bool] = True,
      inplace: bool = False,
      kind: SortKind = "quicksort",
      na_position: NaPosition = "last",
      sort_remaining: bool = True,
      ignore_index: bool = False,
      key: IndexKeyFunc | None = None,
  ) -> DataFrame | None:
      """
      Sort the DataFrame by its labels along the chosen axis.

      With ``inplace=False`` a new, label-sorted DataFrame is returned;
      with ``inplace=True`` the DataFrame itself is reordered and ``None``
      is returned.

      Parameters
      ----------
      axis : {0 or 'index', 1 or 'columns'}, default 0
          Axis whose labels are sorted: 0 sorts the rows, 1 sorts the
          columns.
      level : int or level name or list of ints or list of level names
          If not None, sort on the values of the given index level(s).
      ascending : bool or list-like of bools, default True
          Sort ascending vs. descending. For a MultiIndex the direction
          can be given separately for each level.
      inplace : bool, default False
          Whether to modify the DataFrame rather than creating a new one.
      kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'quicksort'
          Sorting algorithm to use; see :func:`numpy.sort` for details.
          Only `mergesort` and `stable` are stable algorithms. For
          DataFrames, this option is only applied when sorting on a single
          column or label.
      na_position : {'first', 'last'}, default 'last'
          `first` places NaNs at the beginning, `last` places them at the
          end. Not implemented for MultiIndex.
      sort_remaining : bool, default True
          When sorting by level on a multilevel index, also sort by the
          remaining levels (in order) after the specified level.
      ignore_index : bool, default False
          If True, the resulting axis will be labeled 0, 1, …, n - 1.
      key : callable, optional
          Function applied to the index values before sorting. It plays
          the role of the `key` argument of the builtin :meth:`sorted`,
          except that it must be *vectorized*: it receives an ``Index``
          and should return an ``Index`` of the same shape. For MultiIndex
          inputs, the key is applied *per level*.

      Returns
      -------
      DataFrame or None
          The original DataFrame sorted by the labels or None if ``inplace=True``.

      See Also
      --------
      Series.sort_index : Sort Series by the index.
      DataFrame.sort_values : Sort DataFrame by the value.
      Series.sort_values : Sort Series by the value.

      Examples
      --------
      >>> df = pd.DataFrame([1, 2, 3, 4, 5], index=[100, 29, 234, 1, 150],
      ...                   columns=['A'])
      >>> df.sort_index()
           A
      1    4
      29   2
      100  1
      150  5
      234  3

      Sorting is ascending by default; pass ``ascending=False`` for
      descending order.

      >>> df.sort_index(ascending=False)
           A
      234  3
      150  5
      100  1
      29   2
      1    4

      A key function is applied to the index before sorting. For a
      ``MultiIndex`` it is applied to each level separately.

      >>> df = pd.DataFrame({"a": [1, 2, 3, 4]}, index=['A', 'b', 'C', 'd'])
      >>> df.sort_index(key=lambda x: x.str.lower())
         a
      A  1
      b  2
      C  3
      d  4
      """
      # All of the work happens in the parent class; this override only
      # supplies the DataFrame-specific signature and documentation.
      sorted_frame = super().sort_index(
          axis=axis,
          level=level,
          ascending=ascending,
          inplace=inplace,
          kind=kind,
          na_position=na_position,
          sort_remaining=sort_remaining,
          ignore_index=ignore_index,
          key=key,
      )
      return sorted_frame
  6447. def value_counts(
  6448. self,
  6449. subset: IndexLabel | None = None,
  6450. normalize: bool = False,
  6451. sort: bool = True,
  6452. ascending: bool = False,
  6453. dropna: bool = True,
  6454. ) -> Series:
  6455. """
  6456. Return a Series containing the frequency of each distinct row in the Dataframe.
  6457. Parameters
  6458. ----------
  6459. subset : label or list of labels, optional
  6460. Columns to use when counting unique combinations.
  6461. normalize : bool, default False
  6462. Return proportions rather than frequencies.
  6463. sort : bool, default True
  6464. Sort by frequencies when True. Sort by DataFrame column values when False.
  6465. ascending : bool, default False
  6466. Sort in ascending order.
  6467. dropna : bool, default True
  6468. Don't include counts of rows that contain NA values.
  6469. .. versionadded:: 1.3.0
  6470. Returns
  6471. -------
  6472. Series
  6473. See Also
  6474. --------
  6475. Series.value_counts: Equivalent method on Series.
  6476. Notes
  6477. -----
  6478. The returned Series will have a MultiIndex with one level per input
  6479. column but an Index (non-multi) for a single label. By default, rows
  6480. that contain any NA values are omitted from the result. By default,
  6481. the resulting Series will be in descending order so that the first
  6482. element is the most frequently-occurring row.
  6483. Examples
  6484. --------
  6485. >>> df = pd.DataFrame({'num_legs': [2, 4, 4, 6],
  6486. ... 'num_wings': [2, 0, 0, 0]},
  6487. ... index=['falcon', 'dog', 'cat', 'ant'])
  6488. >>> df
  6489. num_legs num_wings
  6490. falcon 2 2
  6491. dog 4 0
  6492. cat 4 0
  6493. ant 6 0
  6494. >>> df.value_counts()
  6495. num_legs num_wings
  6496. 4 0 2
  6497. 2 2 1
  6498. 6 0 1
  6499. Name: count, dtype: int64
  6500. >>> df.value_counts(sort=False)
  6501. num_legs num_wings
  6502. 2 2 1
  6503. 4 0 2
  6504. 6 0 1
  6505. Name: count, dtype: int64
  6506. >>> df.value_counts(ascending=True)
  6507. num_legs num_wings
  6508. 2 2 1
  6509. 6 0 1
  6510. 4 0 2
  6511. Name: count, dtype: int64
  6512. >>> df.value_counts(normalize=True)
  6513. num_legs num_wings
  6514. 4 0 0.50
  6515. 2 2 0.25
  6516. 6 0 0.25
  6517. Name: proportion, dtype: float64
  6518. With `dropna` set to `False` we can also count rows with NA values.
  6519. >>> df = pd.DataFrame({'first_name': ['John', 'Anne', 'John', 'Beth'],
  6520. ... 'middle_name': ['Smith', pd.NA, pd.NA, 'Louise']})
  6521. >>> df
  6522. first_name middle_name
  6523. 0 John Smith
  6524. 1 Anne <NA>
  6525. 2 John <NA>
  6526. 3 Beth Louise
  6527. >>> df.value_counts()
  6528. first_name middle_name
  6529. Beth Louise 1
  6530. John Smith 1
  6531. Name: count, dtype: int64
  6532. >>> df.value_counts(dropna=False)
  6533. first_name middle_name
  6534. Anne NaN 1
  6535. Beth Louise 1
  6536. John Smith 1
  6537. NaN 1
  6538. Name: count, dtype: int64
  6539. >>> df.value_counts("first_name")
  6540. first_name
  6541. John 2
  6542. Anne 1
  6543. Beth 1
  6544. Name: count, dtype: int64
  6545. """
  6546. if subset is None:
  6547. subset = self.columns.tolist()
  6548. name = "proportion" if normalize else "count"
  6549. counts = self.groupby(subset, dropna=dropna, observed=False)._grouper.size()
  6550. counts.name = name
  6551. if sort:
  6552. counts = counts.sort_values(ascending=ascending)
  6553. if normalize:
  6554. counts /= counts.sum()
  6555. # Force MultiIndex for a list_like subset with a single column
  6556. if is_list_like(subset) and len(subset) == 1: # type: ignore[arg-type]
  6557. counts.index = MultiIndex.from_arrays(
  6558. [counts.index], names=[counts.index.name]
  6559. )
  6560. return counts
  def nlargest(
      self, n: int, columns: IndexLabel, keep: NsmallestNlargestKeep = "first"
  ) -> DataFrame:
      """
      Return the first `n` rows ordered by `columns` in descending order.

      The `n` rows holding the largest values in `columns` are returned,
      largest first. Columns that are not named in `columns` are carried
      along in the result but play no part in the ordering.

      This method is equivalent to
      ``df.sort_values(columns, ascending=False).head(n)``, but more
      performant.

      Parameters
      ----------
      n : int
          Number of rows to return.
      columns : label or list of labels
          Column label(s) to order by.
      keep : {'first', 'last', 'all'}, default 'first'
          Where there are duplicate values:

          - ``first`` : prioritize the first occurrence(s)
          - ``last`` : prioritize the last occurrence(s)
          - ``all`` : keep all the ties of the smallest item even if it means
            selecting more than ``n`` items.

      Returns
      -------
      DataFrame
          The first `n` rows ordered by the given columns in descending
          order.

      See Also
      --------
      DataFrame.nsmallest : Return the first `n` rows ordered by `columns` in
          ascending order.
      DataFrame.sort_values : Sort DataFrame by the values.
      DataFrame.head : Return the first `n` rows without re-ordering.

      Notes
      -----
      This function cannot be used with all column types. For example, when
      specifying columns with `object` or `category` dtypes, ``TypeError`` is
      raised.

      Examples
      --------
      >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000,
      ...                                   434000, 434000, 337000, 11300,
      ...                                   11300, 11300],
      ...                    'GDP': [1937894, 2583560 , 12011, 4520, 12128,
      ...                            17036, 182, 38, 311],
      ...                    'alpha-2': ["IT", "FR", "MT", "MV", "BN",
      ...                                "IS", "NR", "TV", "AI"]},
      ...                   index=["Italy", "France", "Malta",
      ...                          "Maldives", "Brunei", "Iceland",
      ...                          "Nauru", "Tuvalu", "Anguilla"])
      >>> df
                population      GDP alpha-2
      Italy       59000000  1937894      IT
      France      65000000  2583560      FR
      Malta         434000    12011      MT
      Maldives      434000     4520      MV
      Brunei        434000    12128      BN
      Iceland       337000    17036      IS
      Nauru          11300      182      NR
      Tuvalu         11300       38      TV
      Anguilla       11300      311      AI

      In the following example, we will use ``nlargest`` to select the three
      rows having the largest values in column "population".

      >>> df.nlargest(3, 'population')
              population      GDP alpha-2
      France    65000000  2583560      FR
      Italy     59000000  1937894      IT
      Malta       434000    12011      MT

      When using ``keep='last'``, ties are resolved in reverse order:

      >>> df.nlargest(3, 'population', keep='last')
              population      GDP alpha-2
      France    65000000  2583560      FR
      Italy     59000000  1937894      IT
      Brunei      434000    12128      BN

      When using ``keep='all'``, the number of element kept can go beyond ``n``
      if there are duplicate values for the smallest element, all the
      ties are kept:

      >>> df.nlargest(3, 'population', keep='all')
                population      GDP alpha-2
      France      65000000  2583560      FR
      Italy       59000000  1937894      IT
      Malta         434000    12011      MT
      Maldives      434000     4520      MV
      Brunei        434000    12128      BN

      However, ``nlargest`` does not keep ``n`` distinct largest elements:

      >>> df.nlargest(5, 'population', keep='all')
                population      GDP alpha-2
      France      65000000  2583560      FR
      Italy       59000000  1937894      IT
      Malta         434000    12011      MT
      Maldives      434000     4520      MV
      Brunei        434000    12128      BN

      To order by the largest values in column "population" and then "GDP",
      we can specify multiple columns like in the next example.

      >>> df.nlargest(3, ['population', 'GDP'])
              population      GDP alpha-2
      France    65000000  2583560      FR
      Italy     59000000  1937894      IT
      Brunei      434000    12128      BN
      """
      # The selection itself is delegated to the shared select-n helper.
      selector = selectn.SelectNFrame(self, n=n, keep=keep, columns=columns)
      return selector.nlargest()
  def nsmallest(
      self, n: int, columns: IndexLabel, keep: NsmallestNlargestKeep = "first"
  ) -> DataFrame:
      """
      Return the first `n` rows ordered by `columns` in ascending order.

      The `n` rows holding the smallest values in `columns` are returned,
      smallest first. Columns that are not named in `columns` are carried
      along in the result but play no part in the ordering.

      This method is equivalent to
      ``df.sort_values(columns, ascending=True).head(n)``, but more
      performant.

      Parameters
      ----------
      n : int
          Number of items to retrieve.
      columns : list or str
          Column name or names to order by.
      keep : {'first', 'last', 'all'}, default 'first'
          Where there are duplicate values:

          - ``first`` : take the first occurrence.
          - ``last`` : take the last occurrence.
          - ``all`` : keep all the ties of the largest item even if it means
            selecting more than ``n`` items.

      Returns
      -------
      DataFrame

      See Also
      --------
      DataFrame.nlargest : Return the first `n` rows ordered by `columns` in
          descending order.
      DataFrame.sort_values : Sort DataFrame by the values.
      DataFrame.head : Return the first `n` rows without re-ordering.

      Examples
      --------
      >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000,
      ...                                   434000, 434000, 337000, 337000,
      ...                                   11300, 11300],
      ...                    'GDP': [1937894, 2583560 , 12011, 4520, 12128,
      ...                            17036, 182, 38, 311],
      ...                    'alpha-2': ["IT", "FR", "MT", "MV", "BN",
      ...                                "IS", "NR", "TV", "AI"]},
      ...                   index=["Italy", "France", "Malta",
      ...                          "Maldives", "Brunei", "Iceland",
      ...                          "Nauru", "Tuvalu", "Anguilla"])
      >>> df
                population      GDP alpha-2
      Italy       59000000  1937894      IT
      France      65000000  2583560      FR
      Malta         434000    12011      MT
      Maldives      434000     4520      MV
      Brunei        434000    12128      BN
      Iceland       337000    17036      IS
      Nauru         337000      182      NR
      Tuvalu         11300       38      TV
      Anguilla       11300      311      AI

      In the following example, we will use ``nsmallest`` to select the
      three rows having the smallest values in column "population".

      >>> df.nsmallest(3, 'population')
                population    GDP alpha-2
      Tuvalu         11300     38      TV
      Anguilla       11300    311      AI
      Iceland       337000  17036      IS

      When using ``keep='last'``, ties are resolved in reverse order:

      >>> df.nsmallest(3, 'population', keep='last')
                population  GDP alpha-2
      Anguilla       11300  311      AI
      Tuvalu         11300   38      TV
      Nauru         337000  182      NR

      When using ``keep='all'``, the number of element kept can go beyond ``n``
      if there are duplicate values for the largest element, all the
      ties are kept.

      >>> df.nsmallest(3, 'population', keep='all')
                population    GDP alpha-2
      Tuvalu         11300     38      TV
      Anguilla       11300    311      AI
      Iceland       337000  17036      IS
      Nauru         337000    182      NR

      However, ``nsmallest`` does not keep ``n`` distinct
      smallest elements:

      >>> df.nsmallest(4, 'population', keep='all')
                population    GDP alpha-2
      Tuvalu         11300     38      TV
      Anguilla       11300    311      AI
      Iceland       337000  17036      IS
      Nauru         337000    182      NR

      To order by the smallest values in column "population" and then "GDP", we can
      specify multiple columns like in the next example.

      >>> df.nsmallest(3, ['population', 'GDP'])
                population  GDP alpha-2
      Tuvalu         11300   38      TV
      Anguilla       11300  311      AI
      Nauru         337000  182      NR
      """
      # The selection itself is delegated to the shared select-n helper.
      selector = selectn.SelectNFrame(self, n=n, keep=keep, columns=columns)
      return selector.nsmallest()
  @doc(
      Series.swaplevel,
      klass=_shared_doc_kwargs["klass"],
      extra_params=dedent(
          """axis : {0 or 'index', 1 or 'columns'}, default 0
            The axis to swap levels on. 0 or 'index' for row-wise, 1 or
            'columns' for column-wise."""
      ),
      examples=dedent(
          """\
      Examples
      --------
      >>> df = pd.DataFrame(
      ...     {"Grade": ["A", "B", "A", "C"]},
      ...     index=[
      ...         ["Final exam", "Final exam", "Coursework", "Coursework"],
      ...         ["History", "Geography", "History", "Geography"],
      ...         ["January", "February", "March", "April"],
      ...     ],
      ... )
      >>> df
                                          Grade
      Final exam  History     January      A
                  Geography   February     B
      Coursework  History     March        A
                  Geography   April        C

      In the following example, we will swap the levels of the indices.
      Here, we will swap the levels column-wise, but levels can be swapped row-wise
      in a similar manner. Note that column-wise is the default behaviour.
      By not supplying any arguments for i and j, we swap the last and second to
      last indices.

      >>> df.swaplevel()
                                          Grade
      Final exam  January     History         A
                  February    Geography       B
      Coursework  March       History         A
                  April       Geography       C

      By supplying one argument, we can choose which index to swap the last
      index with. We can for example swap the first index with the last one as
      follows.

      >>> df.swaplevel(0)
                                          Grade
      January     History     Final exam      A
      February    Geography   Final exam      B
      March       History     Coursework      A
      April       Geography   Coursework      C

      We can also define explicitly which indices we want to swap by supplying values
      for both i and j. Here, we for example swap the first and second indices.

      >>> df.swaplevel(0, 1)
                                          Grade
      History     Final exam  January         A
      Geography   Final exam  February        B
      History     Coursework  March           A
      Geography   Coursework  April           C"""
      ),
  )
  def swaplevel(self, i: Axis = -2, j: Axis = -1, axis: Axis = 0) -> DataFrame:
      # NOTE: no docstring here on purpose -- the ``@doc`` decorator builds
      # it from ``Series.swaplevel`` plus the pieces supplied above.
      axis = self._get_axis_number(axis)

      # Validate before copying (same order as ``reorder_levels``) so that a
      # non-hierarchical axis is rejected without first creating a copy.
      if not isinstance(self._get_axis(axis), MultiIndex):  # pragma: no cover
          raise TypeError("Can only swap levels on a hierarchical axis.")

      result = self.copy(deep=None)

      # Only the labels of the selected axis are replaced; the asserts
      # narrow the axis type for the type checker.
      if axis == 0:
          assert isinstance(result.index, MultiIndex)
          result.index = result.index.swaplevel(i, j)
      else:
          assert isinstance(result.columns, MultiIndex)
          result.columns = result.columns.swaplevel(i, j)
      return result
  def reorder_levels(self, order: Sequence[int | str], axis: Axis = 0) -> DataFrame:
      """
      Rearrange index levels using input order. May not drop or duplicate levels.

      Parameters
      ----------
      order : list of int or list of str
          The new level order. Each level is referenced either by its
          number (position) or by its key (label).
      axis : {0 or 'index', 1 or 'columns'}, default 0
          Where to reorder levels.

      Returns
      -------
      DataFrame

      Raises
      ------
      TypeError
          If the selected axis is not a ``MultiIndex``.

      Examples
      --------
      >>> data = {
      ...     "class": ["Mammals", "Mammals", "Reptiles"],
      ...     "diet": ["Omnivore", "Carnivore", "Carnivore"],
      ...     "species": ["Humans", "Dogs", "Snakes"],
      ... }
      >>> df = pd.DataFrame(data, columns=["class", "diet", "species"])
      >>> df = df.set_index(["class", "diet"])
      >>> df
                                        species
      class      diet
      Mammals    Omnivore                Humans
                 Carnivore                 Dogs
      Reptiles   Carnivore               Snakes

      Let's reorder the levels of the index:

      >>> df.reorder_levels(["diet", "class"])
                                        species
      diet      class
      Omnivore  Mammals                  Humans
      Carnivore Mammals                    Dogs
                Reptiles                 Snakes
      """
      axis_number = self._get_axis_number(axis)
      target_axis = self._get_axis(axis_number)
      if not isinstance(target_axis, MultiIndex):  # pragma: no cover
          raise TypeError("Can only reorder levels on a hierarchical axis.")

      reordered = self.copy(deep=None)

      # Only the labels of the selected axis are replaced; the asserts
      # narrow the axis type for the type checker.
      if axis_number != 0:
          assert isinstance(reordered.columns, MultiIndex)
          reordered.columns = reordered.columns.reorder_levels(order)
      else:
          assert isinstance(reordered.index, MultiIndex)
          reordered.index = reordered.index.reorder_levels(order)
      return reordered
  6872. # ----------------------------------------------------------------------
  6873. # Arithmetic Methods
  def _cmp_method(self, other, op):
      """
      Apply the comparison operator `op` between this frame and `other`.

      `other` is first aligned through ``_align_for_op`` (non-flex), the
      operator is then dispatched via ``_dispatch_frame_op`` and the
      outcome is wrapped by ``_construct_result``.
      """
      axis: Literal[1] = 1  # only relevant for Series other case

      lhs, rhs = self._align_for_op(other, axis, flex=False, level=None)

      # See GH#4537 for discussion of scalar op behavior
      compared = lhs._dispatch_frame_op(rhs, op, axis=axis)
      return lhs._construct_result(compared)
  def _arith_method(self, other, op):
      """
      Apply the arithmetic operator `op` between this frame and `other`.

      Frame-with-frame operations that ``_should_reindex_frame_op`` flags
      are handed to ``_arith_method_with_reindex``. Everything else is
      aligned via ``_align_for_op`` (flex), dispatched through
      ``_dispatch_frame_op`` and wrapped by ``_construct_result``.
      """
      needs_reindex = self._should_reindex_frame_op(other, op, 1, None, None)
      if needs_reindex:
          return self._arith_method_with_reindex(other, op)

      axis: Literal[1] = 1  # only relevant for Series other case
      target_shape = (self.shape[axis],)
      other = ops.maybe_prepare_scalar_for_op(other, target_shape)

      lhs, rhs = self._align_for_op(other, axis, flex=True, level=None)

      # Floating-point warnings raised while evaluating the op are silenced.
      with np.errstate(all="ignore"):
          computed = lhs._dispatch_frame_op(rhs, op, axis=axis)
      return lhs._construct_result(computed)

  # Logical operators share the arithmetic code path.
  _logical_method = _arith_method
  6890. def _dispatch_frame_op(
  6891. self, right, func: Callable, axis: AxisInt | None = None
  6892. ) -> DataFrame:
  6893. """
  6894. Evaluate the frame operation func(left, right) by evaluating
  6895. column-by-column, dispatching to the Series implementation.
  6896. Parameters
  6897. ----------
  6898. right : scalar, Series, or DataFrame
  6899. func : arithmetic or comparison operator
  6900. axis : {None, 0, 1}
  6901. Returns
  6902. -------
  6903. DataFrame
  6904. Notes
  6905. -----
  6906. Caller is responsible for setting np.errstate where relevant.
  6907. """
  6908. # Get the appropriate array-op to apply to each column/block's values.
  6909. array_op = ops.get_array_op(func)
  6910. right = lib.item_from_zerodim(right)
  6911. if not is_list_like(right):
  6912. # i.e. scalar, faster than checking np.ndim(right) == 0
  6913. bm = self._mgr.apply(array_op, right=right)
  6914. return self._constructor_from_mgr(bm, axes=bm.axes)
  6915. elif isinstance(right, DataFrame):
  6916. assert self.index.equals(right.index)
  6917. assert self.columns.equals(right.columns)
  6918. # TODO: The previous assertion `assert right._indexed_same(self)`
  6919. # fails in cases with empty columns reached via
  6920. # _frame_arith_method_with_reindex
  6921. # TODO operate_blockwise expects a manager of the same type
  6922. bm = self._mgr.operate_blockwise(
  6923. # error: Argument 1 to "operate_blockwise" of "ArrayManager" has
  6924. # incompatible type "Union[ArrayManager, BlockManager]"; expected
  6925. # "ArrayManager"
  6926. # error: Argument 1 to "operate_blockwise" of "BlockManager" has
  6927. # incompatible type "Union[ArrayManager, BlockManager]"; expected
  6928. # "BlockManager"
  6929. right._mgr, # type: ignore[arg-type]
  6930. array_op,
  6931. )
  6932. return self._constructor_from_mgr(bm, axes=bm.axes)
  6933. elif isinstance(right, Series) and axis == 1:
  6934. # axis=1 means we want to operate row-by-row
  6935. assert right.index.equals(self.columns)
  6936. right = right._values
  6937. # maybe_align_as_frame ensures we do not have an ndarray here
  6938. assert not isinstance(right, np.ndarray)
  6939. arrays = [
  6940. array_op(_left, _right)
  6941. for _left, _right in zip(self._iter_column_arrays(), right)
  6942. ]
  6943. elif isinstance(right, Series):
  6944. assert right.index.equals(self.index)
  6945. right = right._values
  6946. arrays = [array_op(left, right) for left in self._iter_column_arrays()]
  6947. else:
  6948. raise NotImplementedError(right)
  6949. return type(self)._from_arrays(
  6950. arrays, self.columns, self.index, verify_integrity=False
  6951. )
  def _combine_frame(self, other: DataFrame, func, fill_value=None):
      """
      Combine this frame with `other` using `func` via ``_dispatch_frame_op``.

      When `fill_value` is given, both operands are first passed through
      ``ops.fill_binop`` before `func` is applied.
      """
      # at this point we have `self._indexed_same(other)`

      def _filled_op(left, right):
          # for the mixed_type case where we iterate over columns,
          # _arith_op(left, right) is equivalent to
          # left._binop(right, func, fill_value=fill_value)
          left, right = ops.fill_binop(left, right, fill_value)
          return func(left, right)

      # The operator may be called in a loop, so the fill_value check is
      # done once here and `func` is used as-is when there is nothing to fill.
      frame_op = func if fill_value is None else _filled_op

      return self._dispatch_frame_op(other, frame_op)
  6967. def _arith_method_with_reindex(self, right: DataFrame, op) -> DataFrame:
  6968. """
  6969. For DataFrame-with-DataFrame operations that require reindexing,
  6970. operate only on shared columns, then reindex.
  6971. Parameters
  6972. ----------
  6973. right : DataFrame
  6974. op : binary operator
  6975. Returns
  6976. -------
  6977. DataFrame
  6978. """
  6979. left = self
  6980. # GH#31623, only operate on shared columns
  6981. cols, lcols, rcols = left.columns.join(
  6982. right.columns, how="inner", level=None, return_indexers=True
  6983. )
  6984. new_left = left.iloc[:, lcols]
  6985. new_right = right.iloc[:, rcols]
  6986. result = op(new_left, new_right)
  6987. # Do the join on the columns instead of using left._align_for_op
  6988. # to avoid constructing two potentially large/sparse DataFrames
  6989. join_columns, _, _ = left.columns.join(
  6990. right.columns, how="outer", level=None, return_indexers=True
  6991. )
  6992. if result.columns.has_duplicates:
  6993. # Avoid reindexing with a duplicate axis.
  6994. # https://github.com/pandas-dev/pandas/issues/35194
  6995. indexer, _ = result.columns.get_indexer_non_unique(join_columns)
  6996. indexer = algorithms.unique1d(indexer)
  6997. result = result._reindex_with_indexers(
  6998. {1: [join_columns, indexer]}, allow_dups=True
  6999. )
  7000. else:
  7001. result = result.reindex(join_columns, axis=1)
  7002. return result
  def _should_reindex_frame_op(self, right, op, axis: int, fill_value, level) -> bool:
      """
      Check if this is an operation between DataFrames that will need to reindex.

      True only when `right` is a DataFrame, `op` is not a power operator,
      no `fill_value` or `level` is given, `axis` is 1, and the two frames
      share some but not all of their unique column labels.
      """
      if op is operator.pow or op is roperator.rpow:
          # GH#32685 pow has special semantics for operating with null values
          return False

      if not isinstance(right, DataFrame):
          return False

      # TODO: any other cases we should handle here?
      if fill_value is not None or level is not None or axis != 1:
          return False

      # Intersection is always unique so we have to check the unique columns
      own_labels = self.columns.unique()
      other_labels = right.columns.unique()
      n_shared = len(own_labels.intersection(other_labels))

      if n_shared == 0:
          # TODO: is there a shortcut available when len(cols) == 0?
          return False

      # Reindexing is needed when either side has labels the other lacks.
      return n_shared != len(own_labels) or n_shared != len(other_labels)
  7024. def _align_for_op(
  7025. self,
  7026. other,
  7027. axis: AxisInt,
  7028. flex: bool | None = False,
  7029. level: Level | None = None,
  7030. ):
  7031. """
  7032. Convert rhs to meet lhs dims if input is list, tuple or np.ndarray.
  7033. Parameters
  7034. ----------
  7035. left : DataFrame
  7036. right : Any
  7037. axis : int
  7038. flex : bool or None, default False
  7039. Whether this is a flex op, in which case we reindex.
  7040. None indicates not to check for alignment.
  7041. level : int or level name, default None
  7042. Returns
  7043. -------
  7044. left : DataFrame
  7045. right : Any
  7046. """
  7047. left, right = self, other
  7048. def to_series(right):
  7049. msg = (
  7050. "Unable to coerce to Series, "
  7051. "length must be {req_len}: given {given_len}"
  7052. )
  7053. # pass dtype to avoid doing inference, which would break consistency
  7054. # with Index/Series ops
  7055. dtype = None
  7056. if getattr(right, "dtype", None) == object:
  7057. # can't pass right.dtype unconditionally as that would break on e.g.
  7058. # datetime64[h] ndarray
  7059. dtype = object
  7060. if axis == 0:
  7061. if len(left.index) != len(right):
  7062. raise ValueError(
  7063. msg.format(req_len=len(left.index), given_len=len(right))
  7064. )
  7065. right = left._constructor_sliced(right, index=left.index, dtype=dtype)
  7066. else:
  7067. if len(left.columns) != len(right):
  7068. raise ValueError(
  7069. msg.format(req_len=len(left.columns), given_len=len(right))
  7070. )
  7071. right = left._constructor_sliced(right, index=left.columns, dtype=dtype)
  7072. return right
  7073. if isinstance(right, np.ndarray):
  7074. if right.ndim == 1:
  7075. right = to_series(right)
  7076. elif right.ndim == 2:
  7077. # We need to pass dtype=right.dtype to retain object dtype
  7078. # otherwise we lose consistency with Index and array ops
  7079. dtype = None
  7080. if right.dtype == object:
  7081. # can't pass right.dtype unconditionally as that would break on e.g.
  7082. # datetime64[h] ndarray
  7083. dtype = object
  7084. if right.shape == left.shape:
  7085. right = left._constructor(
  7086. right, index=left.index, columns=left.columns, dtype=dtype
  7087. )
  7088. elif right.shape[0] == left.shape[0] and right.shape[1] == 1:
  7089. # Broadcast across columns
  7090. right = np.broadcast_to(right, left.shape)
  7091. right = left._constructor(
  7092. right, index=left.index, columns=left.columns, dtype=dtype
  7093. )
  7094. elif right.shape[1] == left.shape[1] and right.shape[0] == 1:
  7095. # Broadcast along rows
  7096. right = to_series(right[0, :])
  7097. else:
  7098. raise ValueError(
  7099. "Unable to coerce to DataFrame, shape "
  7100. f"must be {left.shape}: given {right.shape}"
  7101. )
  7102. elif right.ndim > 2:
  7103. raise ValueError(
  7104. "Unable to coerce to Series/DataFrame, "
  7105. f"dimension must be <= 2: {right.shape}"
  7106. )
  7107. elif is_list_like(right) and not isinstance(right, (Series, DataFrame)):
  7108. # GH#36702. Raise when attempting arithmetic with list of array-like.
  7109. if any(is_array_like(el) for el in right):
  7110. raise ValueError(
  7111. f"Unable to coerce list of {type(right[0])} to Series/DataFrame"
  7112. )
  7113. # GH#17901
  7114. right = to_series(right)
  7115. if flex is not None and isinstance(right, DataFrame):
  7116. if not left._indexed_same(right):
  7117. if flex:
  7118. left, right = left.align(
  7119. right, join="outer", level=level, copy=False
  7120. )
  7121. else:
  7122. raise ValueError(
  7123. "Can only compare identically-labeled (both index and columns) "
  7124. "DataFrame objects"
  7125. )
  7126. elif isinstance(right, Series):
  7127. # axis=1 is default for DataFrame-with-Series op
  7128. axis = axis if axis is not None else 1
  7129. if not flex:
  7130. if not left.axes[axis].equals(right.index):
  7131. raise ValueError(
  7132. "Operands are not aligned. Do "
  7133. "`left, right = left.align(right, axis=1, copy=False)` "
  7134. "before operating."
  7135. )
  7136. left, right = left.align(
  7137. right,
  7138. join="outer",
  7139. axis=axis,
  7140. level=level,
  7141. copy=False,
  7142. )
  7143. right = left._maybe_align_series_as_frame(right, axis)
  7144. return left, right
  7145. def _maybe_align_series_as_frame(self, series: Series, axis: AxisInt):
  7146. """
  7147. If the Series operand is not EA-dtype, we can broadcast to 2D and operate
  7148. blockwise.
  7149. """
  7150. rvalues = series._values
  7151. if not isinstance(rvalues, np.ndarray):
  7152. # TODO(EA2D): no need to special-case with 2D EAs
  7153. if rvalues.dtype in ("datetime64[ns]", "timedelta64[ns]"):
  7154. # We can losslessly+cheaply cast to ndarray
  7155. rvalues = np.asarray(rvalues)
  7156. else:
  7157. return series
  7158. if axis == 0:
  7159. rvalues = rvalues.reshape(-1, 1)
  7160. else:
  7161. rvalues = rvalues.reshape(1, -1)
  7162. rvalues = np.broadcast_to(rvalues, self.shape)
  7163. # pass dtype to avoid doing inference
  7164. return self._constructor(
  7165. rvalues,
  7166. index=self.index,
  7167. columns=self.columns,
  7168. dtype=rvalues.dtype,
  7169. )
  7170. def _flex_arith_method(
  7171. self, other, op, *, axis: Axis = "columns", level=None, fill_value=None
  7172. ):
  7173. axis = self._get_axis_number(axis) if axis is not None else 1
  7174. if self._should_reindex_frame_op(other, op, axis, fill_value, level):
  7175. return self._arith_method_with_reindex(other, op)
  7176. if isinstance(other, Series) and fill_value is not None:
  7177. # TODO: We could allow this in cases where we end up going
  7178. # through the DataFrame path
  7179. raise NotImplementedError(f"fill_value {fill_value} not supported.")
  7180. other = ops.maybe_prepare_scalar_for_op(other, self.shape)
  7181. self, other = self._align_for_op(other, axis, flex=True, level=level)
  7182. with np.errstate(all="ignore"):
  7183. if isinstance(other, DataFrame):
  7184. # Another DataFrame
  7185. new_data = self._combine_frame(other, op, fill_value)
  7186. elif isinstance(other, Series):
  7187. new_data = self._dispatch_frame_op(other, op, axis=axis)
  7188. else:
  7189. # in this case we always have `np.ndim(other) == 0`
  7190. if fill_value is not None:
  7191. self = self.fillna(fill_value)
  7192. new_data = self._dispatch_frame_op(other, op)
  7193. return self._construct_result(new_data)
  7194. def _construct_result(self, result) -> DataFrame:
  7195. """
  7196. Wrap the result of an arithmetic, comparison, or logical operation.
  7197. Parameters
  7198. ----------
  7199. result : DataFrame
  7200. Returns
  7201. -------
  7202. DataFrame
  7203. """
  7204. out = self._constructor(result, copy=False).__finalize__(self)
  7205. # Pin columns instead of passing to constructor for compat with
  7206. # non-unique columns case
  7207. out.columns = self.columns
  7208. out.index = self.index
  7209. return out
  7210. def __divmod__(self, other) -> tuple[DataFrame, DataFrame]:
  7211. # Naive implementation, room for optimization
  7212. div = self // other
  7213. mod = self - div * other
  7214. return div, mod
  7215. def __rdivmod__(self, other) -> tuple[DataFrame, DataFrame]:
  7216. # Naive implementation, room for optimization
  7217. div = other // self
  7218. mod = other - div * self
  7219. return div, mod
  7220. def _flex_cmp_method(self, other, op, *, axis: Axis = "columns", level=None):
  7221. axis = self._get_axis_number(axis) if axis is not None else 1
  7222. self, other = self._align_for_op(other, axis, flex=True, level=level)
  7223. new_data = self._dispatch_frame_op(other, op, axis=axis)
  7224. return self._construct_result(new_data)
  7225. @Appender(ops.make_flex_doc("eq", "dataframe"))
  7226. def eq(self, other, axis: Axis = "columns", level=None) -> DataFrame:
  7227. return self._flex_cmp_method(other, operator.eq, axis=axis, level=level)
  7228. @Appender(ops.make_flex_doc("ne", "dataframe"))
  7229. def ne(self, other, axis: Axis = "columns", level=None) -> DataFrame:
  7230. return self._flex_cmp_method(other, operator.ne, axis=axis, level=level)
  7231. @Appender(ops.make_flex_doc("le", "dataframe"))
  7232. def le(self, other, axis: Axis = "columns", level=None) -> DataFrame:
  7233. return self._flex_cmp_method(other, operator.le, axis=axis, level=level)
  7234. @Appender(ops.make_flex_doc("lt", "dataframe"))
  7235. def lt(self, other, axis: Axis = "columns", level=None) -> DataFrame:
  7236. return self._flex_cmp_method(other, operator.lt, axis=axis, level=level)
  7237. @Appender(ops.make_flex_doc("ge", "dataframe"))
  7238. def ge(self, other, axis: Axis = "columns", level=None) -> DataFrame:
  7239. return self._flex_cmp_method(other, operator.ge, axis=axis, level=level)
  7240. @Appender(ops.make_flex_doc("gt", "dataframe"))
  7241. def gt(self, other, axis: Axis = "columns", level=None) -> DataFrame:
  7242. return self._flex_cmp_method(other, operator.gt, axis=axis, level=level)
  7243. @Appender(ops.make_flex_doc("add", "dataframe"))
  7244. def add(
  7245. self, other, axis: Axis = "columns", level=None, fill_value=None
  7246. ) -> DataFrame:
  7247. return self._flex_arith_method(
  7248. other, operator.add, level=level, fill_value=fill_value, axis=axis
  7249. )
  7250. @Appender(ops.make_flex_doc("radd", "dataframe"))
  7251. def radd(
  7252. self, other, axis: Axis = "columns", level=None, fill_value=None
  7253. ) -> DataFrame:
  7254. return self._flex_arith_method(
  7255. other, roperator.radd, level=level, fill_value=fill_value, axis=axis
  7256. )
  7257. @Appender(ops.make_flex_doc("sub", "dataframe"))
  7258. def sub(
  7259. self, other, axis: Axis = "columns", level=None, fill_value=None
  7260. ) -> DataFrame:
  7261. return self._flex_arith_method(
  7262. other, operator.sub, level=level, fill_value=fill_value, axis=axis
  7263. )
  7264. subtract = sub
  7265. @Appender(ops.make_flex_doc("rsub", "dataframe"))
  7266. def rsub(
  7267. self, other, axis: Axis = "columns", level=None, fill_value=None
  7268. ) -> DataFrame:
  7269. return self._flex_arith_method(
  7270. other, roperator.rsub, level=level, fill_value=fill_value, axis=axis
  7271. )
  7272. @Appender(ops.make_flex_doc("mul", "dataframe"))
  7273. def mul(
  7274. self, other, axis: Axis = "columns", level=None, fill_value=None
  7275. ) -> DataFrame:
  7276. return self._flex_arith_method(
  7277. other, operator.mul, level=level, fill_value=fill_value, axis=axis
  7278. )
  7279. multiply = mul
  7280. @Appender(ops.make_flex_doc("rmul", "dataframe"))
  7281. def rmul(
  7282. self, other, axis: Axis = "columns", level=None, fill_value=None
  7283. ) -> DataFrame:
  7284. return self._flex_arith_method(
  7285. other, roperator.rmul, level=level, fill_value=fill_value, axis=axis
  7286. )
  7287. @Appender(ops.make_flex_doc("truediv", "dataframe"))
  7288. def truediv(
  7289. self, other, axis: Axis = "columns", level=None, fill_value=None
  7290. ) -> DataFrame:
  7291. return self._flex_arith_method(
  7292. other, operator.truediv, level=level, fill_value=fill_value, axis=axis
  7293. )
  7294. div = truediv
  7295. divide = truediv
  7296. @Appender(ops.make_flex_doc("rtruediv", "dataframe"))
  7297. def rtruediv(
  7298. self, other, axis: Axis = "columns", level=None, fill_value=None
  7299. ) -> DataFrame:
  7300. return self._flex_arith_method(
  7301. other, roperator.rtruediv, level=level, fill_value=fill_value, axis=axis
  7302. )
  7303. rdiv = rtruediv
  7304. @Appender(ops.make_flex_doc("floordiv", "dataframe"))
  7305. def floordiv(
  7306. self, other, axis: Axis = "columns", level=None, fill_value=None
  7307. ) -> DataFrame:
  7308. return self._flex_arith_method(
  7309. other, operator.floordiv, level=level, fill_value=fill_value, axis=axis
  7310. )
  7311. @Appender(ops.make_flex_doc("rfloordiv", "dataframe"))
  7312. def rfloordiv(
  7313. self, other, axis: Axis = "columns", level=None, fill_value=None
  7314. ) -> DataFrame:
  7315. return self._flex_arith_method(
  7316. other, roperator.rfloordiv, level=level, fill_value=fill_value, axis=axis
  7317. )
  7318. @Appender(ops.make_flex_doc("mod", "dataframe"))
  7319. def mod(
  7320. self, other, axis: Axis = "columns", level=None, fill_value=None
  7321. ) -> DataFrame:
  7322. return self._flex_arith_method(
  7323. other, operator.mod, level=level, fill_value=fill_value, axis=axis
  7324. )
  7325. @Appender(ops.make_flex_doc("rmod", "dataframe"))
  7326. def rmod(
  7327. self, other, axis: Axis = "columns", level=None, fill_value=None
  7328. ) -> DataFrame:
  7329. return self._flex_arith_method(
  7330. other, roperator.rmod, level=level, fill_value=fill_value, axis=axis
  7331. )
  7332. @Appender(ops.make_flex_doc("pow", "dataframe"))
  7333. def pow(
  7334. self, other, axis: Axis = "columns", level=None, fill_value=None
  7335. ) -> DataFrame:
  7336. return self._flex_arith_method(
  7337. other, operator.pow, level=level, fill_value=fill_value, axis=axis
  7338. )
  7339. @Appender(ops.make_flex_doc("rpow", "dataframe"))
  7340. def rpow(
  7341. self, other, axis: Axis = "columns", level=None, fill_value=None
  7342. ) -> DataFrame:
  7343. return self._flex_arith_method(
  7344. other, roperator.rpow, level=level, fill_value=fill_value, axis=axis
  7345. )
  7346. # ----------------------------------------------------------------------
  7347. # Combination-Related
  7348. @doc(
  7349. _shared_docs["compare"],
  7350. dedent(
  7351. """
  7352. Returns
  7353. -------
  7354. DataFrame
  7355. DataFrame that shows the differences stacked side by side.
  7356. The resulting index will be a MultiIndex with 'self' and 'other'
  7357. stacked alternately at the inner level.
  7358. Raises
  7359. ------
  7360. ValueError
  7361. When the two DataFrames don't have identical labels or shape.
  7362. See Also
  7363. --------
  7364. Series.compare : Compare with another Series and show differences.
  7365. DataFrame.equals : Test whether two objects contain the same elements.
  7366. Notes
  7367. -----
  7368. Matching NaNs will not appear as a difference.
  7369. Can only compare identically-labeled
  7370. (i.e. same shape, identical row and column labels) DataFrames
  7371. Examples
  7372. --------
  7373. >>> df = pd.DataFrame(
  7374. ... {{
  7375. ... "col1": ["a", "a", "b", "b", "a"],
  7376. ... "col2": [1.0, 2.0, 3.0, np.nan, 5.0],
  7377. ... "col3": [1.0, 2.0, 3.0, 4.0, 5.0]
  7378. ... }},
  7379. ... columns=["col1", "col2", "col3"],
  7380. ... )
  7381. >>> df
  7382. col1 col2 col3
  7383. 0 a 1.0 1.0
  7384. 1 a 2.0 2.0
  7385. 2 b 3.0 3.0
  7386. 3 b NaN 4.0
  7387. 4 a 5.0 5.0
  7388. >>> df2 = df.copy()
  7389. >>> df2.loc[0, 'col1'] = 'c'
  7390. >>> df2.loc[2, 'col3'] = 4.0
  7391. >>> df2
  7392. col1 col2 col3
  7393. 0 c 1.0 1.0
  7394. 1 a 2.0 2.0
  7395. 2 b 3.0 4.0
  7396. 3 b NaN 4.0
  7397. 4 a 5.0 5.0
  7398. Align the differences on columns
  7399. >>> df.compare(df2)
  7400. col1 col3
  7401. self other self other
  7402. 0 a c NaN NaN
  7403. 2 NaN NaN 3.0 4.0
  7404. Assign result_names
  7405. >>> df.compare(df2, result_names=("left", "right"))
  7406. col1 col3
  7407. left right left right
  7408. 0 a c NaN NaN
  7409. 2 NaN NaN 3.0 4.0
  7410. Stack the differences on rows
  7411. >>> df.compare(df2, align_axis=0)
  7412. col1 col3
  7413. 0 self a NaN
  7414. other c NaN
  7415. 2 self NaN 3.0
  7416. other NaN 4.0
  7417. Keep the equal values
  7418. >>> df.compare(df2, keep_equal=True)
  7419. col1 col3
  7420. self other self other
  7421. 0 a c 1.0 1.0
  7422. 2 b b 3.0 4.0
  7423. Keep all original rows and columns
  7424. >>> df.compare(df2, keep_shape=True)
  7425. col1 col2 col3
  7426. self other self other self other
  7427. 0 a c NaN NaN NaN NaN
  7428. 1 NaN NaN NaN NaN NaN NaN
  7429. 2 NaN NaN NaN NaN 3.0 4.0
  7430. 3 NaN NaN NaN NaN NaN NaN
  7431. 4 NaN NaN NaN NaN NaN NaN
  7432. Keep all original rows and columns and also all original values
  7433. >>> df.compare(df2, keep_shape=True, keep_equal=True)
  7434. col1 col2 col3
  7435. self other self other self other
  7436. 0 a c 1.0 1.0 1.0 1.0
  7437. 1 a a 2.0 2.0 2.0 2.0
  7438. 2 b b 3.0 3.0 3.0 4.0
  7439. 3 b b NaN NaN 4.0 4.0
  7440. 4 a a 5.0 5.0 5.0 5.0
  7441. """
  7442. ),
  7443. klass=_shared_doc_kwargs["klass"],
  7444. )
  def compare(
      self,
      other: DataFrame,
      align_axis: Axis = 1,
      keep_shape: bool = False,
      keep_equal: bool = False,
      result_names: Suffixes = ("self", "other"),
  ) -> DataFrame:
      # No docstring on purpose: the ``@doc`` decorator on this method supplies
      # the documentation. Every argument is handed unchanged to the parent
      # class implementation.
      forwarded = {
          "other": other,
          "align_axis": align_axis,
          "keep_shape": keep_shape,
          "keep_equal": keep_equal,
          "result_names": result_names,
      }
      return super().compare(**forwarded)
  7460. def combine(
  7461. self,
  7462. other: DataFrame,
  7463. func: Callable[[Series, Series], Series | Hashable],
  7464. fill_value=None,
  7465. overwrite: bool = True,
  7466. ) -> DataFrame:
  7467. """
  7468. Perform column-wise combine with another DataFrame.
  7469. Combines a DataFrame with `other` DataFrame using `func`
  7470. to element-wise combine columns. The row and column indexes of the
  7471. resulting DataFrame will be the union of the two.
  7472. Parameters
  7473. ----------
  7474. other : DataFrame
  7475. The DataFrame to merge column-wise.
  7476. func : function
  7477. Function that takes two series as inputs and return a Series or a
  7478. scalar. Used to merge the two dataframes column by columns.
  7479. fill_value : scalar value, default None
  7480. The value to fill NaNs with prior to passing any column to the
  7481. merge func.
  7482. overwrite : bool, default True
  7483. If True, columns in `self` that do not exist in `other` will be
  7484. overwritten with NaNs.
  7485. Returns
  7486. -------
  7487. DataFrame
  7488. Combination of the provided DataFrames.
  7489. See Also
  7490. --------
  7491. DataFrame.combine_first : Combine two DataFrame objects and default to
  7492. non-null values in frame calling the method.
  7493. Examples
  7494. --------
  7495. Combine using a simple function that chooses the smaller column.
  7496. >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]})
  7497. >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
  7498. >>> take_smaller = lambda s1, s2: s1 if s1.sum() < s2.sum() else s2
  7499. >>> df1.combine(df2, take_smaller)
  7500. A B
  7501. 0 0 3
  7502. 1 0 3
  7503. Example using a true element-wise combine function.
  7504. >>> df1 = pd.DataFrame({'A': [5, 0], 'B': [2, 4]})
  7505. >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
  7506. >>> df1.combine(df2, np.minimum)
  7507. A B
  7508. 0 1 2
  7509. 1 0 3
  7510. Using `fill_value` fills Nones prior to passing the column to the
  7511. merge function.
  7512. >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [None, 4]})
  7513. >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
  7514. >>> df1.combine(df2, take_smaller, fill_value=-5)
  7515. A B
  7516. 0 0 -5.0
  7517. 1 0 4.0
  7518. However, if the same element in both dataframes is None, that None
  7519. is preserved
  7520. >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [None, 4]})
  7521. >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [None, 3]})
  7522. >>> df1.combine(df2, take_smaller, fill_value=-5)
  7523. A B
  7524. 0 0 -5.0
  7525. 1 0 3.0
  7526. Example that demonstrates the use of `overwrite` and behavior when
  7527. the axis differ between the dataframes.
  7528. >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]})
  7529. >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [-10, 1], }, index=[1, 2])
  7530. >>> df1.combine(df2, take_smaller)
  7531. A B C
  7532. 0 NaN NaN NaN
  7533. 1 NaN 3.0 -10.0
  7534. 2 NaN 3.0 1.0
  7535. >>> df1.combine(df2, take_smaller, overwrite=False)
  7536. A B C
  7537. 0 0.0 NaN NaN
  7538. 1 0.0 3.0 -10.0
  7539. 2 NaN 3.0 1.0
  7540. Demonstrating the preference of the passed in dataframe.
  7541. >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1], }, index=[1, 2])
  7542. >>> df2.combine(df1, take_smaller)
  7543. A B C
  7544. 0 0.0 NaN NaN
  7545. 1 0.0 3.0 NaN
  7546. 2 NaN 3.0 NaN
  7547. >>> df2.combine(df1, take_smaller, overwrite=False)
  7548. A B C
  7549. 0 0.0 NaN NaN
  7550. 1 0.0 3.0 1.0
  7551. 2 NaN 3.0 1.0
  7552. """
  7553. other_idxlen = len(other.index) # save for compare
  7554. this, other = self.align(other, copy=False)
  7555. new_index = this.index
  7556. if other.empty and len(new_index) == len(self.index):
  7557. return self.copy()
  7558. if self.empty and len(other) == other_idxlen:
  7559. return other.copy()
  7560. # sorts if possible; otherwise align above ensures that these are set-equal
  7561. new_columns = this.columns.union(other.columns)
  7562. do_fill = fill_value is not None
  7563. result = {}
  7564. for col in new_columns:
  7565. series = this[col]
  7566. other_series = other[col]
  7567. this_dtype = series.dtype
  7568. other_dtype = other_series.dtype
  7569. this_mask = isna(series)
  7570. other_mask = isna(other_series)
  7571. # don't overwrite columns unnecessarily
  7572. # DO propagate if this column is not in the intersection
  7573. if not overwrite and other_mask.all():
  7574. result[col] = this[col].copy()
  7575. continue
  7576. if do_fill:
  7577. series = series.copy()
  7578. other_series = other_series.copy()
  7579. series[this_mask] = fill_value
  7580. other_series[other_mask] = fill_value
  7581. if col not in self.columns:
  7582. # If self DataFrame does not have col in other DataFrame,
  7583. # try to promote series, which is all NaN, as other_dtype.
  7584. new_dtype = other_dtype
  7585. try:
  7586. series = series.astype(new_dtype, copy=False)
  7587. except ValueError:
  7588. # e.g. new_dtype is integer types
  7589. pass
  7590. else:
  7591. # if we have different dtypes, possibly promote
  7592. new_dtype = find_common_type([this_dtype, other_dtype])
  7593. series = series.astype(new_dtype, copy=False)
  7594. other_series = other_series.astype(new_dtype, copy=False)
  7595. arr = func(series, other_series)
  7596. if isinstance(new_dtype, np.dtype):
  7597. # if new_dtype is an EA Dtype, then `func` is expected to return
  7598. # the correct dtype without any additional casting
  7599. # error: No overload variant of "maybe_downcast_to_dtype" matches
  7600. # argument types "Union[Series, Hashable]", "dtype[Any]"
  7601. arr = maybe_downcast_to_dtype( # type: ignore[call-overload]
  7602. arr, new_dtype
  7603. )
  7604. result[col] = arr
  7605. # convert_objects just in case
  7606. frame_result = self._constructor(result, index=new_index, columns=new_columns)
  7607. return frame_result.__finalize__(self, method="combine")
  def combine_first(self, other: DataFrame) -> DataFrame:
      """
      Update null elements with value in the same location in `other`.

      Combine two DataFrame objects by filling null values in one DataFrame
      with non-null values from other DataFrame. The row and column indexes
      of the resulting DataFrame will be the union of the two. The resulting
      dataframe contains the 'first' dataframe values and overrides the
      second one values where both first.loc[index, col] and
      second.loc[index, col] are not missing values, upon calling
      first.combine_first(second).

      Parameters
      ----------
      other : DataFrame
          Provided DataFrame to use to fill null values.

      Returns
      -------
      DataFrame
          The result of combining the provided DataFrame with the other object.

      See Also
      --------
      DataFrame.combine : Perform series-wise operation on two DataFrames
          using a given function.

      Examples
      --------
      >>> df1 = pd.DataFrame({'A': [None, 0], 'B': [None, 4]})
      >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
      >>> df1.combine_first(df2)
           A    B
      0  1.0  3.0
      1  0.0  4.0

      Null values still persist if the location of that null value
      does not exist in `other`

      >>> df1 = pd.DataFrame({'A': [None, 0], 'B': [4, None]})
      >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1]}, index=[1, 2])
      >>> df1.combine_first(df2)
           A    B    C
      0  NaN  4.0  NaN
      1  0.0  3.0  1.0
      2  NaN  3.0  1.0
      """
      from pandas.core.computation import expressions

      def combiner(x: Series, y: Series):
          # If the column y in other DataFrame is not in first DataFrame,
          # just return y_values.
          if y.name not in self.columns:
              return y._values
          # keep x where it is present, otherwise fall back to y
          missing = x.isna()._values
          return expressions.where(missing, y._values, x._values)

      if len(other) == 0:
          # nothing to fill from: only add other's extra columns and dtypes
          extra_columns = other.columns.difference(self.columns)
          combined = self.reindex(self.columns.append(extra_columns), axis=1)
          combined = combined.astype(other.dtypes)
      else:
          combined = self.combine(other, combiner, overwrite=False)

      # Shared columns whose dtype changed are cast to the common dtype of
      # the two inputs.
      recast = {}
      for col in self.columns.intersection(other.columns):
          if combined.dtypes[col] != self.dtypes[col]:
              recast[col] = find_common_type([self.dtypes[col], other.dtypes[col]])

      if recast:
          combined = combined.astype(recast)

      return combined.__finalize__(self, method="combine_first")
  def update(
      self,
      other,
      join: UpdateJoin = "left",
      overwrite: bool = True,
      filter_func=None,
      errors: IgnoreRaise = "ignore",
  ) -> None:
      """
      Modify in place using non-NA values from another DataFrame.

      Aligns on indices. There is no return value.

      Parameters
      ----------
      other : DataFrame, or object coercible into a DataFrame
          Should have at least one matching index/column label
          with the original DataFrame. If a Series is passed,
          its name attribute must be set, and that will be
          used as the column name to align with the original DataFrame.
      join : {'left'}, default 'left'
          Only left join is implemented, keeping the index and columns of the
          original object.
      overwrite : bool, default True
          How to handle non-NA values for overlapping keys:

          * True: overwrite original DataFrame's values
            with values from `other`.
          * False: only update values that are NA in
            the original DataFrame.

      filter_func : callable(1d-array) -> bool 1d-array, optional
          Can choose to replace values other than NA. Return True for values
          that should be updated.
      errors : {'raise', 'ignore'}, default 'ignore'
          If 'raise', will raise a ValueError if the DataFrame and `other`
          both contain non-NA data in the same place. The overlap check is
          not performed when `filter_func` is given.

      Returns
      -------
      None
          This method directly changes calling object.

      Raises
      ------
      ValueError
          * When `errors='raise'` and there's overlapping non-NA data.
          * When `errors` is not either `'ignore'` or `'raise'`
      NotImplementedError
          * If `join != 'left'`

      See Also
      --------
      dict.update : Similar method for dictionaries.
      DataFrame.merge : For column(s)-on-column(s) operations.

      Examples
      --------
      >>> df = pd.DataFrame({'A': [1, 2, 3],
      ...                    'B': [400, 500, 600]})
      >>> new_df = pd.DataFrame({'B': [4, 5, 6],
      ...                        'C': [7, 8, 9]})
      >>> df.update(new_df)
      >>> df
         A  B
      0  1  4
      1  2  5
      2  3  6

      The DataFrame's length does not increase as a result of the update,
      only values at matching index/column labels are updated.

      >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
      ...                    'B': ['x', 'y', 'z']})
      >>> new_df = pd.DataFrame({'B': ['d', 'e', 'f', 'g', 'h', 'i']})
      >>> df.update(new_df)
      >>> df
         A  B
      0  a  d
      1  b  e
      2  c  f

      >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
      ...                    'B': ['x', 'y', 'z']})
      >>> new_df = pd.DataFrame({'B': ['d', 'f']}, index=[0, 2])
      >>> df.update(new_df)
      >>> df
         A  B
      0  a  d
      1  b  y
      2  c  f

      For Series, its name attribute must be set.

      >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
      ...                    'B': ['x', 'y', 'z']})
      >>> new_column = pd.Series(['d', 'e', 'f'], name='B')
      >>> df.update(new_column)
      >>> df
         A  B
      0  a  d
      1  b  e
      2  c  f

      If `other` contains NaNs the corresponding values are not updated
      in the original dataframe.

      >>> df = pd.DataFrame({'A': [1, 2, 3],
      ...                    'B': [400., 500., 600.]})
      >>> new_df = pd.DataFrame({'B': [4, np.nan, 6]})
      >>> df.update(new_df)
      >>> df
         A      B
      0  1    4.0
      1  2  500.0
      2  3    6.0
      """
      # These checks compare the reference count of ``self`` with REF_COUNT,
      # so they are kept first and no extra reference to ``self`` is created
      # before them.
      if not PYPY and not WARNING_CHECK_DISABLED and using_copy_on_write():
          if sys.getrefcount(self) <= REF_COUNT:
              warnings.warn(
                  _chained_assignment_method_msg,
                  ChainedAssignmentError,
                  stacklevel=2,
              )
      elif (
          not PYPY
          and not WARNING_CHECK_DISABLED
          and not using_copy_on_write()
          and self._is_view_after_cow_rules()
      ):
          if sys.getrefcount(self) <= REF_COUNT:
              warnings.warn(
                  _chained_assignment_warning_method_msg,
                  FutureWarning,
                  stacklevel=2,
              )

      # TODO: Support other joins
      if join != "left":  # pragma: no cover
          raise NotImplementedError("Only left join is supported")
      if errors not in ["ignore", "raise"]:
          raise ValueError("The parameter errors must be either 'ignore' or 'raise'")

      if not isinstance(other, DataFrame):
          other = DataFrame(other)

      # left join: only rows that exist in ``self`` are considered
      other = other.reindex(self.index)

      for col in self.columns.intersection(other.columns):
          this = self[col]._values
          that = other[col]._values

          # ``mask`` is True where the current value of ``self`` is kept
          if filter_func is not None:
              mask = ~filter_func(this) | isna(that)
          else:
              if errors == "raise":
                  this_has_data = notna(this)
                  that_has_data = notna(that)
                  # vectorised reduction instead of iterating the mask
                  # element by element with the builtin ``any``
                  if (this_has_data & that_has_data).any():
                      raise ValueError("Data overlaps.")

              if overwrite:
                  mask = isna(that)
              else:
                  mask = notna(this)

          # don't overwrite columns unnecessarily
          if mask.all():
              continue

          with warnings.catch_warnings():
              warnings.filterwarnings(
                  "ignore",
                  message="Downcasting behavior",
                  category=FutureWarning,
              )
              # GH#57124 - `that` might get upcasted because of NA values, and then
              # downcasted in where because of the mask. Ignoring the warning
              # is a stopgap, will replace with a new implementation of update
              # in 3.0.
              self.loc[:, col] = self[col].where(mask, that)
  7831. # ----------------------------------------------------------------------
  7832. # Data reshaping
  7833. @Appender(
  7834. dedent(
  7835. """
  7836. Examples
  7837. --------
  7838. >>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon',
  7839. ... 'Parrot', 'Parrot'],
  7840. ... 'Max Speed': [380., 370., 24., 26.]})
  7841. >>> df
  7842. Animal Max Speed
  7843. 0 Falcon 380.0
  7844. 1 Falcon 370.0
  7845. 2 Parrot 24.0
  7846. 3 Parrot 26.0
  7847. >>> df.groupby(['Animal']).mean()
  7848. Max Speed
  7849. Animal
  7850. Falcon 375.0
  7851. Parrot 25.0
  7852. **Hierarchical Indexes**
  7853. We can groupby different levels of a hierarchical index
  7854. using the `level` parameter:
  7855. >>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'],
  7856. ... ['Captive', 'Wild', 'Captive', 'Wild']]
  7857. >>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type'))
  7858. >>> df = pd.DataFrame({'Max Speed': [390., 350., 30., 20.]},
  7859. ... index=index)
  7860. >>> df
  7861. Max Speed
  7862. Animal Type
  7863. Falcon Captive 390.0
  7864. Wild 350.0
  7865. Parrot Captive 30.0
  7866. Wild 20.0
  7867. >>> df.groupby(level=0).mean()
  7868. Max Speed
  7869. Animal
  7870. Falcon 370.0
  7871. Parrot 25.0
  7872. >>> df.groupby(level="Type").mean()
  7873. Max Speed
  7874. Type
  7875. Captive 210.0
  7876. Wild 185.0
  7877. We can also choose to include NA in group keys or not by setting
  7878. `dropna` parameter, the default setting is `True`.
  7879. >>> l = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]]
  7880. >>> df = pd.DataFrame(l, columns=["a", "b", "c"])
  7881. >>> df.groupby(by=["b"]).sum()
  7882. a c
  7883. b
  7884. 1.0 2 3
  7885. 2.0 2 5
  7886. >>> df.groupby(by=["b"], dropna=False).sum()
  7887. a c
  7888. b
  7889. 1.0 2 3
  7890. 2.0 2 5
  7891. NaN 1 4
  7892. >>> l = [["a", 12, 12], [None, 12.3, 33.], ["b", 12.3, 123], ["a", 1, 1]]
  7893. >>> df = pd.DataFrame(l, columns=["a", "b", "c"])
  7894. >>> df.groupby(by="a").sum()
  7895. b c
  7896. a
  7897. a 13.0 13.0
  7898. b 12.3 123.0
  7899. >>> df.groupby(by="a", dropna=False).sum()
  7900. b c
  7901. a
  7902. a 13.0 13.0
  7903. b 12.3 123.0
  7904. NaN 12.3 33.0
  7905. When using ``.apply()``, use ``group_keys`` to include or exclude the
  7906. group keys. The ``group_keys`` argument defaults to ``True`` (include).
  7907. >>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon',
  7908. ... 'Parrot', 'Parrot'],
  7909. ... 'Max Speed': [380., 370., 24., 26.]})
  7910. >>> df.groupby("Animal", group_keys=True)[['Max Speed']].apply(lambda x: x)
  7911. Max Speed
  7912. Animal
  7913. Falcon 0 380.0
  7914. 1 370.0
  7915. Parrot 2 24.0
  7916. 3 26.0
  7917. >>> df.groupby("Animal", group_keys=False)[['Max Speed']].apply(lambda x: x)
  7918. Max Speed
  7919. 0 380.0
  7920. 1 370.0
  7921. 2 24.0
  7922. 3 26.0
  7923. """
  7924. )
  7925. )
  7926. @Appender(_shared_docs["groupby"] % _shared_doc_kwargs)
  7927. def groupby(
  7928. self,
  7929. by=None,
  7930. axis: Axis | lib.NoDefault = lib.no_default,
  7931. level: IndexLabel | None = None,
  7932. as_index: bool = True,
  7933. sort: bool = True,
  7934. group_keys: bool = True,
  7935. observed: bool | lib.NoDefault = lib.no_default,
  7936. dropna: bool = True,
  7937. ) -> DataFrameGroupBy:
  7938. if axis is not lib.no_default:
  7939. axis = self._get_axis_number(axis)
  7940. if axis == 1:
  7941. warnings.warn(
  7942. "DataFrame.groupby with axis=1 is deprecated. Do "
  7943. "`frame.T.groupby(...)` without axis instead.",
  7944. FutureWarning,
  7945. stacklevel=find_stack_level(),
  7946. )
  7947. else:
  7948. warnings.warn(
  7949. "The 'axis' keyword in DataFrame.groupby is deprecated and "
  7950. "will be removed in a future version.",
  7951. FutureWarning,
  7952. stacklevel=find_stack_level(),
  7953. )
  7954. else:
  7955. axis = 0
  7956. from pandas.core.groupby.generic import DataFrameGroupBy
  7957. if level is None and by is None:
  7958. raise TypeError("You have to supply one of 'by' and 'level'")
  7959. return DataFrameGroupBy(
  7960. obj=self,
  7961. keys=by,
  7962. axis=axis,
  7963. level=level,
  7964. as_index=as_index,
  7965. sort=sort,
  7966. group_keys=group_keys,
  7967. observed=observed,
  7968. dropna=dropna,
  7969. )
  7970. _shared_docs[
  7971. "pivot"
  7972. ] = """
  7973. Return reshaped DataFrame organized by given index / column values.
  7974. Reshape data (produce a "pivot" table) based on column values. Uses
  7975. unique values from specified `index` / `columns` to form axes of the
  7976. resulting DataFrame. This function does not support data
  7977. aggregation, multiple values will result in a MultiIndex in the
  7978. columns. See the :ref:`User Guide <reshaping>` for more on reshaping.
  7979. Parameters
  7980. ----------%s
  7981. columns : str or object or a list of str
  7982. Column to use to make new frame's columns.
  7983. index : str or object or a list of str, optional
  7984. Column to use to make new frame's index. If not given, uses existing index.
  7985. values : str, object or a list of the previous, optional
  7986. Column(s) to use for populating new frame's values. If not
  7987. specified, all remaining columns will be used and the result will
  7988. have hierarchically indexed columns.
  7989. Returns
  7990. -------
  7991. DataFrame
  7992. Returns reshaped DataFrame.
  7993. Raises
  7994. ------
  7995. ValueError:
  7996. When there are any `index`, `columns` combinations with multiple
  7997. values. Use `DataFrame.pivot_table` when you need to aggregate.
  7998. See Also
  7999. --------
  8000. DataFrame.pivot_table : Generalization of pivot that can handle
  8001. duplicate values for one index/column pair.
  8002. DataFrame.unstack : Pivot based on the index values instead of a
  8003. column.
  8004. wide_to_long : Wide panel to long format. Less flexible but more
  8005. user-friendly than melt.
  8006. Notes
  8007. -----
  8008. For finer-tuned control, see hierarchical indexing documentation along
  8009. with the related stack/unstack methods.
  8010. Reference :ref:`the user guide <reshaping.pivot>` for more examples.
  8011. Examples
  8012. --------
  8013. >>> df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two',
  8014. ... 'two'],
  8015. ... 'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
  8016. ... 'baz': [1, 2, 3, 4, 5, 6],
  8017. ... 'zoo': ['x', 'y', 'z', 'q', 'w', 't']})
  8018. >>> df
  8019. foo bar baz zoo
  8020. 0 one A 1 x
  8021. 1 one B 2 y
  8022. 2 one C 3 z
  8023. 3 two A 4 q
  8024. 4 two B 5 w
  8025. 5 two C 6 t
  8026. >>> df.pivot(index='foo', columns='bar', values='baz')
  8027. bar A B C
  8028. foo
  8029. one 1 2 3
  8030. two 4 5 6
  8031. >>> df.pivot(index='foo', columns='bar')['baz']
  8032. bar A B C
  8033. foo
  8034. one 1 2 3
  8035. two 4 5 6
  8036. >>> df.pivot(index='foo', columns='bar', values=['baz', 'zoo'])
  8037. baz zoo
  8038. bar A B C A B C
  8039. foo
  8040. one 1 2 3 x y z
  8041. two 4 5 6 q w t
  8042. You could also assign a list of column names or a list of index names.
  8043. >>> df = pd.DataFrame({
  8044. ... "lev1": [1, 1, 1, 2, 2, 2],
  8045. ... "lev2": [1, 1, 2, 1, 1, 2],
  8046. ... "lev3": [1, 2, 1, 2, 1, 2],
  8047. ... "lev4": [1, 2, 3, 4, 5, 6],
  8048. ... "values": [0, 1, 2, 3, 4, 5]})
  8049. >>> df
  8050. lev1 lev2 lev3 lev4 values
  8051. 0 1 1 1 1 0
  8052. 1 1 1 2 2 1
  8053. 2 1 2 1 3 2
  8054. 3 2 1 2 4 3
  8055. 4 2 1 1 5 4
  8056. 5 2 2 2 6 5
  8057. >>> df.pivot(index="lev1", columns=["lev2", "lev3"], values="values")
  8058. lev2 1 2
  8059. lev3 1 2 1 2
  8060. lev1
  8061. 1 0.0 1.0 2.0 NaN
  8062. 2 4.0 3.0 NaN 5.0
  8063. >>> df.pivot(index=["lev1", "lev2"], columns=["lev3"], values="values")
  8064. lev3 1 2
  8065. lev1 lev2
  8066. 1 1 0.0 1.0
  8067. 2 2.0 NaN
  8068. 2 1 4.0 3.0
  8069. 2 NaN 5.0
  8070. A ValueError is raised if there are any duplicates.
  8071. >>> df = pd.DataFrame({"foo": ['one', 'one', 'two', 'two'],
  8072. ... "bar": ['A', 'A', 'B', 'C'],
  8073. ... "baz": [1, 2, 3, 4]})
  8074. >>> df
  8075. foo bar baz
  8076. 0 one A 1
  8077. 1 one A 2
  8078. 2 two B 3
  8079. 3 two C 4
  8080. Notice that the first two rows are the same for our `index`
  8081. and `columns` arguments.
  8082. >>> df.pivot(index='foo', columns='bar', values='baz')
  8083. Traceback (most recent call last):
  8084. ...
  8085. ValueError: Index contains duplicate entries, cannot reshape
  8086. """
  8087. @Substitution("")
  8088. @Appender(_shared_docs["pivot"])
  8089. def pivot(
  8090. self, *, columns, index=lib.no_default, values=lib.no_default
  8091. ) -> DataFrame:
  8092. from pandas.core.reshape.pivot import pivot
  8093. return pivot(self, index=index, columns=columns, values=values)
  8094. _shared_docs[
  8095. "pivot_table"
  8096. ] = """
  8097. Create a spreadsheet-style pivot table as a DataFrame.
  8098. The levels in the pivot table will be stored in MultiIndex objects
  8099. (hierarchical indexes) on the index and columns of the result DataFrame.
  8100. Parameters
  8101. ----------%s
  8102. values : list-like or scalar, optional
  8103. Column or columns to aggregate.
  8104. index : column, Grouper, array, or list of the previous
  8105. Keys to group by on the pivot table index. If a list is passed,
  8106. it can contain any of the other types (except list). If an array is
  8107. passed, it must be the same length as the data and will be used in
  8108. the same manner as column values.
  8109. columns : column, Grouper, array, or list of the previous
  8110. Keys to group by on the pivot table column. If a list is passed,
  8111. it can contain any of the other types (except list). If an array is
  8112. passed, it must be the same length as the data and will be used in
  8113. the same manner as column values.
  8114. aggfunc : function, list of functions, dict, default "mean"
  8115. If a list of functions is passed, the resulting pivot table will have
  8116. hierarchical columns whose top level are the function names
  8117. (inferred from the function objects themselves).
  8118. If a dict is passed, the key is column to aggregate and the value is
  8119. function or list of functions. If ``margins=True``, aggfunc will be
  8120. used to calculate the partial aggregates.
  8121. fill_value : scalar, default None
  8122. Value to replace missing values with (in the resulting pivot table,
  8123. after aggregation).
  8124. margins : bool, default False
  8125. If ``margins=True``, special ``All`` columns and rows
  8126. will be added with partial group aggregates across the categories
  8127. on the rows and columns.
  8128. dropna : bool, default True
  8129. Do not include columns whose entries are all NaN. If True,
  8130. rows with a NaN value in any column will be omitted before
  8131. computing margins.
  8132. margins_name : str, default 'All'
  8133. Name of the row / column that will contain the totals
  8134. when margins is True.
  8135. observed : bool, default False
  8136. This only applies if any of the groupers are Categoricals.
  8137. If True: only show observed values for categorical groupers.
  8138. If False: show all values for categorical groupers.
  8139. .. deprecated:: 2.2.0
  8140. The default value of ``False`` is deprecated and will change to
  8141. ``True`` in a future version of pandas.
  8142. sort : bool, default True
  8143. Specifies if the result should be sorted.
  8144. .. versionadded:: 1.3.0
  8145. Returns
  8146. -------
  8147. DataFrame
  8148. An Excel style pivot table.
  8149. See Also
  8150. --------
  8151. DataFrame.pivot : Pivot without aggregation that can handle
  8152. non-numeric data.
  8153. DataFrame.melt : Unpivot a DataFrame from wide to long format,
  8154. optionally leaving identifiers set.
  8155. wide_to_long : Wide panel to long format. Less flexible but more
  8156. user-friendly than melt.
  8157. Notes
  8158. -----
  8159. Reference :ref:`the user guide <reshaping.pivot>` for more examples.
  8160. Examples
  8161. --------
  8162. >>> df = pd.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo",
  8163. ... "bar", "bar", "bar", "bar"],
  8164. ... "B": ["one", "one", "one", "two", "two",
  8165. ... "one", "one", "two", "two"],
  8166. ... "C": ["small", "large", "large", "small",
  8167. ... "small", "large", "small", "small",
  8168. ... "large"],
  8169. ... "D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
  8170. ... "E": [2, 4, 5, 5, 6, 6, 8, 9, 9]})
  8171. >>> df
  8172. A B C D E
  8173. 0 foo one small 1 2
  8174. 1 foo one large 2 4
  8175. 2 foo one large 2 5
  8176. 3 foo two small 3 5
  8177. 4 foo two small 3 6
  8178. 5 bar one large 4 6
  8179. 6 bar one small 5 8
  8180. 7 bar two small 6 9
  8181. 8 bar two large 7 9
  8182. This first example aggregates values by taking the sum.
  8183. >>> table = pd.pivot_table(df, values='D', index=['A', 'B'],
  8184. ... columns=['C'], aggfunc="sum")
  8185. >>> table
  8186. C large small
  8187. A B
  8188. bar one 4.0 5.0
  8189. two 7.0 6.0
  8190. foo one 4.0 1.0
  8191. two NaN 6.0
  8192. We can also fill missing values using the `fill_value` parameter.
  8193. >>> table = pd.pivot_table(df, values='D', index=['A', 'B'],
  8194. ... columns=['C'], aggfunc="sum", fill_value=0)
  8195. >>> table
  8196. C large small
  8197. A B
  8198. bar one 4 5
  8199. two 7 6
  8200. foo one 4 1
  8201. two 0 6
  8202. The next example aggregates by taking the mean across multiple columns.
  8203. >>> table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'],
  8204. ... aggfunc={'D': "mean", 'E': "mean"})
  8205. >>> table
  8206. D E
  8207. A C
  8208. bar large 5.500000 7.500000
  8209. small 5.500000 8.500000
  8210. foo large 2.000000 4.500000
  8211. small 2.333333 4.333333
  8212. We can also calculate multiple types of aggregations for any given
  8213. value column.
  8214. >>> table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'],
  8215. ... aggfunc={'D': "mean",
  8216. ... 'E': ["min", "max", "mean"]})
  8217. >>> table
  8218. D E
  8219. mean max mean min
  8220. A C
  8221. bar large 5.500000 9 7.500000 6
  8222. small 5.500000 9 8.500000 8
  8223. foo large 2.000000 5 4.500000 4
  8224. small 2.333333 6 4.333333 2
  8225. """
  8226. @Substitution("")
  8227. @Appender(_shared_docs["pivot_table"])
  8228. def pivot_table(
  8229. self,
  8230. values=None,
  8231. index=None,
  8232. columns=None,
  8233. aggfunc: AggFuncType = "mean",
  8234. fill_value=None,
  8235. margins: bool = False,
  8236. dropna: bool = True,
  8237. margins_name: Level = "All",
  8238. observed: bool | lib.NoDefault = lib.no_default,
  8239. sort: bool = True,
  8240. ) -> DataFrame:
  8241. from pandas.core.reshape.pivot import pivot_table
  8242. return pivot_table(
  8243. self,
  8244. values=values,
  8245. index=index,
  8246. columns=columns,
  8247. aggfunc=aggfunc,
  8248. fill_value=fill_value,
  8249. margins=margins,
  8250. dropna=dropna,
  8251. margins_name=margins_name,
  8252. observed=observed,
  8253. sort=sort,
  8254. )
  8255. def stack(
  8256. self,
  8257. level: IndexLabel = -1,
  8258. dropna: bool | lib.NoDefault = lib.no_default,
  8259. sort: bool | lib.NoDefault = lib.no_default,
  8260. future_stack: bool = False,
  8261. ):
  8262. """
  8263. Stack the prescribed level(s) from columns to index.
  8264. Return a reshaped DataFrame or Series having a multi-level
  8265. index with one or more new inner-most levels compared to the current
  8266. DataFrame. The new inner-most levels are created by pivoting the
  8267. columns of the current dataframe:
  8268. - if the columns have a single level, the output is a Series;
  8269. - if the columns have multiple levels, the new index
  8270. level(s) is (are) taken from the prescribed level(s) and
  8271. the output is a DataFrame.
  8272. Parameters
  8273. ----------
  8274. level : int, str, list, default -1
  8275. Level(s) to stack from the column axis onto the index
  8276. axis, defined as one index or label, or a list of indices
  8277. or labels.
  8278. dropna : bool, default True
  8279. Whether to drop rows in the resulting Frame/Series with
  8280. missing values. Stacking a column level onto the index
  8281. axis can create combinations of index and column values
  8282. that are missing from the original dataframe. See Examples
  8283. section.
  8284. sort : bool, default True
  8285. Whether to sort the levels of the resulting MultiIndex.
  8286. future_stack : bool, default False
  8287. Whether to use the new implementation that will replace the current
  8288. implementation in pandas 3.0. When True, dropna and sort have no impact
  8289. on the result and must remain unspecified. See :ref:`pandas 2.1.0 Release
  8290. notes <whatsnew_210.enhancements.new_stack>` for more details.
  8291. Returns
  8292. -------
  8293. DataFrame or Series
  8294. Stacked dataframe or series.
  8295. See Also
  8296. --------
  8297. DataFrame.unstack : Unstack prescribed level(s) from index axis
  8298. onto column axis.
  8299. DataFrame.pivot : Reshape dataframe from long format to wide
  8300. format.
  8301. DataFrame.pivot_table : Create a spreadsheet-style pivot table
  8302. as a DataFrame.
  8303. Notes
  8304. -----
  8305. The function is named by analogy with a collection of books
  8306. being reorganized from being side by side on a horizontal
  8307. position (the columns of the dataframe) to being stacked
  8308. vertically on top of each other (in the index of the
  8309. dataframe).
  8310. Reference :ref:`the user guide <reshaping.stacking>` for more examples.
  8311. Examples
  8312. --------
  8313. **Single level columns**
  8314. >>> df_single_level_cols = pd.DataFrame([[0, 1], [2, 3]],
  8315. ... index=['cat', 'dog'],
  8316. ... columns=['weight', 'height'])
  8317. Stacking a dataframe with a single level column axis returns a Series:
  8318. >>> df_single_level_cols
  8319. weight height
  8320. cat 0 1
  8321. dog 2 3
  8322. >>> df_single_level_cols.stack(future_stack=True)
  8323. cat weight 0
  8324. height 1
  8325. dog weight 2
  8326. height 3
  8327. dtype: int64
  8328. **Multi level columns: simple case**
  8329. >>> multicol1 = pd.MultiIndex.from_tuples([('weight', 'kg'),
  8330. ... ('weight', 'pounds')])
  8331. >>> df_multi_level_cols1 = pd.DataFrame([[1, 2], [2, 4]],
  8332. ... index=['cat', 'dog'],
  8333. ... columns=multicol1)
  8334. Stacking a dataframe with a multi-level column axis:
  8335. >>> df_multi_level_cols1
  8336. weight
  8337. kg pounds
  8338. cat 1 2
  8339. dog 2 4
  8340. >>> df_multi_level_cols1.stack(future_stack=True)
  8341. weight
  8342. cat kg 1
  8343. pounds 2
  8344. dog kg 2
  8345. pounds 4
  8346. **Missing values**
  8347. >>> multicol2 = pd.MultiIndex.from_tuples([('weight', 'kg'),
  8348. ... ('height', 'm')])
  8349. >>> df_multi_level_cols2 = pd.DataFrame([[1.0, 2.0], [3.0, 4.0]],
  8350. ... index=['cat', 'dog'],
  8351. ... columns=multicol2)
  8352. It is common to have missing values when stacking a dataframe
  8353. with multi-level columns, as the stacked dataframe typically
  8354. has more values than the original dataframe. Missing values
  8355. are filled with NaNs:
  8356. >>> df_multi_level_cols2
  8357. weight height
  8358. kg m
  8359. cat 1.0 2.0
  8360. dog 3.0 4.0
  8361. >>> df_multi_level_cols2.stack(future_stack=True)
  8362. weight height
  8363. cat kg 1.0 NaN
  8364. m NaN 2.0
  8365. dog kg 3.0 NaN
  8366. m NaN 4.0
  8367. **Prescribing the level(s) to be stacked**
  8368. The first parameter controls which level or levels are stacked:
  8369. >>> df_multi_level_cols2.stack(0, future_stack=True)
  8370. kg m
  8371. cat weight 1.0 NaN
  8372. height NaN 2.0
  8373. dog weight 3.0 NaN
  8374. height NaN 4.0
  8375. >>> df_multi_level_cols2.stack([0, 1], future_stack=True)
  8376. cat weight kg 1.0
  8377. height m 2.0
  8378. dog weight kg 3.0
  8379. height m 4.0
  8380. dtype: float64
  8381. """
  8382. if not future_stack:
  8383. from pandas.core.reshape.reshape import (
  8384. stack,
  8385. stack_multiple,
  8386. )
  8387. if (
  8388. dropna is not lib.no_default
  8389. or sort is not lib.no_default
  8390. or self.columns.nlevels > 1
  8391. ):
  8392. warnings.warn(
  8393. "The previous implementation of stack is deprecated and will be "
  8394. "removed in a future version of pandas. See the What's New notes "
  8395. "for pandas 2.1.0 for details. Specify future_stack=True to adopt "
  8396. "the new implementation and silence this warning.",
  8397. FutureWarning,
  8398. stacklevel=find_stack_level(),
  8399. )
  8400. if dropna is lib.no_default:
  8401. dropna = True
  8402. if sort is lib.no_default:
  8403. sort = True
  8404. if isinstance(level, (tuple, list)):
  8405. result = stack_multiple(self, level, dropna=dropna, sort=sort)
  8406. else:
  8407. result = stack(self, level, dropna=dropna, sort=sort)
  8408. else:
  8409. from pandas.core.reshape.reshape import stack_v3
  8410. if dropna is not lib.no_default:
  8411. raise ValueError(
  8412. "dropna must be unspecified with future_stack=True as the new "
  8413. "implementation does not introduce rows of NA values. This "
  8414. "argument will be removed in a future version of pandas."
  8415. )
  8416. if sort is not lib.no_default:
  8417. raise ValueError(
  8418. "Cannot specify sort with future_stack=True, this argument will be "
  8419. "removed in a future version of pandas. Sort the result using "
  8420. ".sort_index instead."
  8421. )
  8422. if (
  8423. isinstance(level, (tuple, list))
  8424. and not all(lev in self.columns.names for lev in level)
  8425. and not all(isinstance(lev, int) for lev in level)
  8426. ):
  8427. raise ValueError(
  8428. "level should contain all level names or all level "
  8429. "numbers, not a mixture of the two."
  8430. )
  8431. if not isinstance(level, (tuple, list)):
  8432. level = [level]
  8433. level = [self.columns._get_level_number(lev) for lev in level]
  8434. result = stack_v3(self, level)
  8435. return result.__finalize__(self, method="stack")
  8436. def explode(
  8437. self,
  8438. column: IndexLabel,
  8439. ignore_index: bool = False,
  8440. ) -> DataFrame:
  8441. """
  8442. Transform each element of a list-like to a row, replicating index values.
  8443. Parameters
  8444. ----------
  8445. column : IndexLabel
  8446. Column(s) to explode.
  8447. For multiple columns, specify a non-empty list with each element
  8448. be str or tuple, and all specified columns their list-like data
  8449. on same row of the frame must have matching length.
  8450. .. versionadded:: 1.3.0
  8451. Multi-column explode
  8452. ignore_index : bool, default False
  8453. If True, the resulting index will be labeled 0, 1, …, n - 1.
  8454. Returns
  8455. -------
  8456. DataFrame
  8457. Exploded lists to rows of the subset columns;
  8458. index will be duplicated for these rows.
  8459. Raises
  8460. ------
  8461. ValueError :
  8462. * If columns of the frame are not unique.
  8463. * If specified columns to explode is empty list.
  8464. * If specified columns to explode have not matching count of
  8465. elements rowwise in the frame.
  8466. See Also
  8467. --------
  8468. DataFrame.unstack : Pivot a level of the (necessarily hierarchical)
  8469. index labels.
  8470. DataFrame.melt : Unpivot a DataFrame from wide format to long format.
  8471. Series.explode : Explode a DataFrame from list-like columns to long format.
  8472. Notes
  8473. -----
  8474. This routine will explode list-likes including lists, tuples, sets,
  8475. Series, and np.ndarray. The result dtype of the subset rows will
  8476. be object. Scalars will be returned unchanged, and empty list-likes will
  8477. result in a np.nan for that row. In addition, the ordering of rows in the
  8478. output will be non-deterministic when exploding sets.
  8479. Reference :ref:`the user guide <reshaping.explode>` for more examples.
  8480. Examples
  8481. --------
  8482. >>> df = pd.DataFrame({'A': [[0, 1, 2], 'foo', [], [3, 4]],
  8483. ... 'B': 1,
  8484. ... 'C': [['a', 'b', 'c'], np.nan, [], ['d', 'e']]})
  8485. >>> df
  8486. A B C
  8487. 0 [0, 1, 2] 1 [a, b, c]
  8488. 1 foo 1 NaN
  8489. 2 [] 1 []
  8490. 3 [3, 4] 1 [d, e]
  8491. Single-column explode.
  8492. >>> df.explode('A')
  8493. A B C
  8494. 0 0 1 [a, b, c]
  8495. 0 1 1 [a, b, c]
  8496. 0 2 1 [a, b, c]
  8497. 1 foo 1 NaN
  8498. 2 NaN 1 []
  8499. 3 3 1 [d, e]
  8500. 3 4 1 [d, e]
  8501. Multi-column explode.
  8502. >>> df.explode(list('AC'))
  8503. A B C
  8504. 0 0 1 a
  8505. 0 1 1 b
  8506. 0 2 1 c
  8507. 1 foo 1 NaN
  8508. 2 NaN 1 NaN
  8509. 3 3 1 d
  8510. 3 4 1 e
  8511. """
  8512. if not self.columns.is_unique:
  8513. duplicate_cols = self.columns[self.columns.duplicated()].tolist()
  8514. raise ValueError(
  8515. f"DataFrame columns must be unique. Duplicate columns: {duplicate_cols}"
  8516. )
  8517. columns: list[Hashable]
  8518. if is_scalar(column) or isinstance(column, tuple):
  8519. columns = [column]
  8520. elif isinstance(column, list) and all(
  8521. is_scalar(c) or isinstance(c, tuple) for c in column
  8522. ):
  8523. if not column:
  8524. raise ValueError("column must be nonempty")
  8525. if len(column) > len(set(column)):
  8526. raise ValueError("column must be unique")
  8527. columns = column
  8528. else:
  8529. raise ValueError("column must be a scalar, tuple, or list thereof")
  8530. df = self.reset_index(drop=True)
  8531. if len(columns) == 1:
  8532. result = df[columns[0]].explode()
  8533. else:
  8534. mylen = lambda x: len(x) if (is_list_like(x) and len(x) > 0) else 1
  8535. counts0 = self[columns[0]].apply(mylen)
  8536. for c in columns[1:]:
  8537. if not all(counts0 == self[c].apply(mylen)):
  8538. raise ValueError("columns must have matching element counts")
  8539. result = DataFrame({c: df[c].explode() for c in columns})
  8540. result = df.drop(columns, axis=1).join(result)
  8541. if ignore_index:
  8542. result.index = default_index(len(result))
  8543. else:
  8544. result.index = self.index.take(result.index)
  8545. result = result.reindex(columns=self.columns, copy=False)
  8546. return result.__finalize__(self, method="explode")
  8547. def unstack(self, level: IndexLabel = -1, fill_value=None, sort: bool = True):
  8548. """
  8549. Pivot a level of the (necessarily hierarchical) index labels.
  8550. Returns a DataFrame having a new level of column labels whose inner-most level
  8551. consists of the pivoted index labels.
  8552. If the index is not a MultiIndex, the output will be a Series
  8553. (the analogue of stack when the columns are not a MultiIndex).
  8554. Parameters
  8555. ----------
  8556. level : int, str, or list of these, default -1 (last level)
  8557. Level(s) of index to unstack, can pass level name.
  8558. fill_value : int, str or dict
  8559. Replace NaN with this value if the unstack produces missing values.
  8560. sort : bool, default True
  8561. Sort the level(s) in the resulting MultiIndex columns.
  8562. Returns
  8563. -------
  8564. Series or DataFrame
  8565. See Also
  8566. --------
  8567. DataFrame.pivot : Pivot a table based on column values.
  8568. DataFrame.stack : Pivot a level of the column labels (inverse operation
  8569. from `unstack`).
  8570. Notes
  8571. -----
  8572. Reference :ref:`the user guide <reshaping.stacking>` for more examples.
  8573. Examples
  8574. --------
  8575. >>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'),
  8576. ... ('two', 'a'), ('two', 'b')])
  8577. >>> s = pd.Series(np.arange(1.0, 5.0), index=index)
  8578. >>> s
  8579. one a 1.0
  8580. b 2.0
  8581. two a 3.0
  8582. b 4.0
  8583. dtype: float64
  8584. >>> s.unstack(level=-1)
  8585. a b
  8586. one 1.0 2.0
  8587. two 3.0 4.0
  8588. >>> s.unstack(level=0)
  8589. one two
  8590. a 1.0 3.0
  8591. b 2.0 4.0
  8592. >>> df = s.unstack(level=0)
  8593. >>> df.unstack()
  8594. one a 1.0
  8595. b 2.0
  8596. two a 3.0
  8597. b 4.0
  8598. dtype: float64
  8599. """
  8600. from pandas.core.reshape.reshape import unstack
  8601. result = unstack(self, level, fill_value, sort)
  8602. return result.__finalize__(self, method="unstack")
  8603. @Appender(_shared_docs["melt"] % {"caller": "df.melt(", "other": "melt"})
  8604. def melt(
  8605. self,
  8606. id_vars=None,
  8607. value_vars=None,
  8608. var_name=None,
  8609. value_name: Hashable = "value",
  8610. col_level: Level | None = None,
  8611. ignore_index: bool = True,
  8612. ) -> DataFrame:
  8613. return melt(
  8614. self,
  8615. id_vars=id_vars,
  8616. value_vars=value_vars,
  8617. var_name=var_name,
  8618. value_name=value_name,
  8619. col_level=col_level,
  8620. ignore_index=ignore_index,
  8621. ).__finalize__(self, method="melt")
  8622. # ----------------------------------------------------------------------
  8623. # Time series-related
  8624. @doc(
  8625. Series.diff,
  8626. klass="DataFrame",
  8627. extra_params="axis : {0 or 'index', 1 or 'columns'}, default 0\n "
  8628. "Take difference over rows (0) or columns (1).\n",
  8629. other_klass="Series",
  8630. examples=dedent(
  8631. """
  8632. Difference with previous row
  8633. >>> df = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6],
  8634. ... 'b': [1, 1, 2, 3, 5, 8],
  8635. ... 'c': [1, 4, 9, 16, 25, 36]})
  8636. >>> df
  8637. a b c
  8638. 0 1 1 1
  8639. 1 2 1 4
  8640. 2 3 2 9
  8641. 3 4 3 16
  8642. 4 5 5 25
  8643. 5 6 8 36
  8644. >>> df.diff()
  8645. a b c
  8646. 0 NaN NaN NaN
  8647. 1 1.0 0.0 3.0
  8648. 2 1.0 1.0 5.0
  8649. 3 1.0 1.0 7.0
  8650. 4 1.0 2.0 9.0
  8651. 5 1.0 3.0 11.0
  8652. Difference with previous column
  8653. >>> df.diff(axis=1)
  8654. a b c
  8655. 0 NaN 0 0
  8656. 1 NaN -1 3
  8657. 2 NaN -1 7
  8658. 3 NaN -1 13
  8659. 4 NaN 0 20
  8660. 5 NaN 2 28
  8661. Difference with 3rd previous row
  8662. >>> df.diff(periods=3)
  8663. a b c
  8664. 0 NaN NaN NaN
  8665. 1 NaN NaN NaN
  8666. 2 NaN NaN NaN
  8667. 3 3.0 2.0 15.0
  8668. 4 3.0 4.0 21.0
  8669. 5 3.0 6.0 27.0
  8670. Difference with following row
  8671. >>> df.diff(periods=-1)
  8672. a b c
  8673. 0 -1.0 0.0 -3.0
  8674. 1 -1.0 -1.0 -5.0
  8675. 2 -1.0 -1.0 -7.0
  8676. 3 -1.0 -2.0 -9.0
  8677. 4 -1.0 -3.0 -11.0
  8678. 5 NaN NaN NaN
  8679. Overflow in input dtype
  8680. >>> df = pd.DataFrame({'a': [1, 0]}, dtype=np.uint8)
  8681. >>> df.diff()
  8682. a
  8683. 0 NaN
  8684. 1 255.0"""
  8685. ),
  8686. )
  8687. def diff(self, periods: int = 1, axis: Axis = 0) -> DataFrame:
  8688. if not lib.is_integer(periods):
  8689. if not (is_float(periods) and periods.is_integer()):
  8690. raise ValueError("periods must be an integer")
  8691. periods = int(periods)
  8692. axis = self._get_axis_number(axis)
  8693. if axis == 1:
  8694. if periods != 0:
  8695. # in the periods == 0 case, this is equivalent diff of 0 periods
  8696. # along axis=0, and the Manager method may be somewhat more
  8697. # performant, so we dispatch in that case.
  8698. return self - self.shift(periods, axis=axis)
  8699. # With periods=0 this is equivalent to a diff with axis=0
  8700. axis = 0
  8701. new_data = self._mgr.diff(n=periods)
  8702. res_df = self._constructor_from_mgr(new_data, axes=new_data.axes)
  8703. return res_df.__finalize__(self, "diff")
  8704. # ----------------------------------------------------------------------
  8705. # Function application
  8706. def _gotitem(
  8707. self,
  8708. key: IndexLabel,
  8709. ndim: int,
  8710. subset: DataFrame | Series | None = None,
  8711. ) -> DataFrame | Series:
  8712. """
  8713. Sub-classes to define. Return a sliced object.
  8714. Parameters
  8715. ----------
  8716. key : string / list of selections
  8717. ndim : {1, 2}
  8718. requested ndim of result
  8719. subset : object, default None
  8720. subset to act on
  8721. """
  8722. if subset is None:
  8723. subset = self
  8724. elif subset.ndim == 1: # is Series
  8725. return subset
  8726. # TODO: _shallow_copy(subset)?
  8727. return subset[key]
  8728. _agg_see_also_doc = dedent(
  8729. """
  8730. See Also
  8731. --------
  8732. DataFrame.apply : Perform any type of operations.
  8733. DataFrame.transform : Perform transformation type operations.
  8734. pandas.DataFrame.groupby : Perform operations over groups.
  8735. pandas.DataFrame.resample : Perform operations over resampled bins.
  8736. pandas.DataFrame.rolling : Perform operations over rolling window.
  8737. pandas.DataFrame.expanding : Perform operations over expanding window.
  8738. pandas.core.window.ewm.ExponentialMovingWindow : Perform operation over exponential
  8739. weighted window.
  8740. """
  8741. )
# "Examples" section text that the ``@doc`` decorator on ``aggregate`` receives
# through its ``examples`` argument.
_agg_examples_doc = dedent(
    """
Examples
--------
>>> df = pd.DataFrame([[1, 2, 3],
... [4, 5, 6],
... [7, 8, 9],
... [np.nan, np.nan, np.nan]],
... columns=['A', 'B', 'C'])
Aggregate these functions over the rows.
>>> df.agg(['sum', 'min'])
A B C
sum 12.0 15.0 18.0
min 1.0 2.0 3.0
Different aggregations per column.
>>> df.agg({'A' : ['sum', 'min'], 'B' : ['min', 'max']})
A B
sum 12.0 NaN
min 1.0 2.0
max NaN 8.0
Aggregate different functions over the columns and rename the index of the resulting
DataFrame.
>>> df.agg(x=('A', 'max'), y=('B', 'min'), z=('C', 'mean'))
A B C
x 7.0 NaN NaN
y NaN 2.0 NaN
z NaN NaN 6.0
Aggregate over the columns.
>>> df.agg("mean", axis="columns")
0 2.0
1 5.0
2 8.0
3 NaN
dtype: float64
"""
)
  8778. @doc(
  8779. _shared_docs["aggregate"],
  8780. klass=_shared_doc_kwargs["klass"],
  8781. axis=_shared_doc_kwargs["axis"],
  8782. see_also=_agg_see_also_doc,
  8783. examples=_agg_examples_doc,
  8784. )
  8785. def aggregate(self, func=None, axis: Axis = 0, *args, **kwargs):
  8786. from pandas.core.apply import frame_apply
  8787. axis = self._get_axis_number(axis)
  8788. op = frame_apply(self, func=func, axis=axis, args=args, kwargs=kwargs)
  8789. result = op.agg()
  8790. result = reconstruct_and_relabel_result(result, func, **kwargs)
  8791. return result
  8792. agg = aggregate
  8793. @doc(
  8794. _shared_docs["transform"],
  8795. klass=_shared_doc_kwargs["klass"],
  8796. axis=_shared_doc_kwargs["axis"],
  8797. )
  8798. def transform(
  8799. self, func: AggFuncType, axis: Axis = 0, *args, **kwargs
  8800. ) -> DataFrame:
  8801. from pandas.core.apply import frame_apply
  8802. op = frame_apply(self, func=func, axis=axis, args=args, kwargs=kwargs)
  8803. result = op.transform()
  8804. assert isinstance(result, DataFrame)
  8805. return result
  8806. def apply(
  8807. self,
  8808. func: AggFuncType,
  8809. axis: Axis = 0,
  8810. raw: bool = False,
  8811. result_type: Literal["expand", "reduce", "broadcast"] | None = None,
  8812. args=(),
  8813. by_row: Literal[False, "compat"] = "compat",
  8814. engine: Literal["python", "numba"] = "python",
  8815. engine_kwargs: dict[str, bool] | None = None,
  8816. **kwargs,
  8817. ):
  8818. """
  8819. Apply a function along an axis of the DataFrame.
  8820. Objects passed to the function are Series objects whose index is
  8821. either the DataFrame's index (``axis=0``) or the DataFrame's columns
  8822. (``axis=1``). By default (``result_type=None``), the final return type
  8823. is inferred from the return type of the applied function. Otherwise,
  8824. it depends on the `result_type` argument.
  8825. Parameters
  8826. ----------
  8827. func : function
  8828. Function to apply to each column or row.
  8829. axis : {0 or 'index', 1 or 'columns'}, default 0
  8830. Axis along which the function is applied:
  8831. * 0 or 'index': apply function to each column.
  8832. * 1 or 'columns': apply function to each row.
  8833. raw : bool, default False
  8834. Determines if row or column is passed as a Series or ndarray object:
  8835. * ``False`` : passes each row or column as a Series to the
  8836. function.
  8837. * ``True`` : the passed function will receive ndarray objects
  8838. instead.
  8839. If you are just applying a NumPy reduction function this will
  8840. achieve much better performance.
  8841. result_type : {'expand', 'reduce', 'broadcast', None}, default None
  8842. These only act when ``axis=1`` (columns):
  8843. * 'expand' : list-like results will be turned into columns.
  8844. * 'reduce' : returns a Series if possible rather than expanding
  8845. list-like results. This is the opposite of 'expand'.
  8846. * 'broadcast' : results will be broadcast to the original shape
  8847. of the DataFrame, the original index and columns will be
  8848. retained.
  8849. The default behaviour (None) depends on the return value of the
  8850. applied function: list-like results will be returned as a Series
  8851. of those. However if the apply function returns a Series these
  8852. are expanded to columns.
  8853. args : tuple
  8854. Positional arguments to pass to `func` in addition to the
  8855. array/series.
  8856. by_row : False or "compat", default "compat"
  8857. Only has an effect when ``func`` is a listlike or dictlike of funcs
  8858. and the func isn't a string.
  8859. If "compat", will if possible first translate the func into pandas
  8860. methods (e.g. ``Series().apply(np.sum)`` will be translated to
  8861. ``Series().sum()``). If that doesn't work, will try call to apply again with
  8862. ``by_row=True`` and if that fails, will call apply again with
  8863. ``by_row=False`` (backward compatible).
  8864. If False, the funcs will be passed the whole Series at once.
  8865. .. versionadded:: 2.1.0
  8866. engine : {'python', 'numba'}, default 'python'
  8867. Choose between the python (default) engine or the numba engine in apply.
  8868. The numba engine will attempt to JIT compile the passed function,
  8869. which may result in speedups for large DataFrames.
  8870. It also supports the following engine_kwargs :
  8871. - nopython (compile the function in nopython mode)
  8872. - nogil (release the GIL inside the JIT compiled function)
  8873. - parallel (try to apply the function in parallel over the DataFrame)
  8874. Note: Due to limitations within numba/how pandas interfaces with numba,
  8875. you should only use this if raw=True
  8876. Note: The numba compiler only supports a subset of
  8877. valid Python/numpy operations.
  8878. Please read more about the `supported python features
  8879. <https://numba.pydata.org/numba-doc/dev/reference/pysupported.html>`_
  8880. and `supported numpy features
  8881. <https://numba.pydata.org/numba-doc/dev/reference/numpysupported.html>`_
  8882. in numba to learn what you can or cannot use in the passed function.
  8883. .. versionadded:: 2.2.0
  8884. engine_kwargs : dict
  8885. Pass keyword arguments to the engine.
  8886. This is currently only used by the numba engine,
  8887. see the documentation for the engine argument for more information.
  8888. **kwargs
  8889. Additional keyword arguments to pass as keywords arguments to
  8890. `func`.
  8891. Returns
  8892. -------
  8893. Series or DataFrame
  8894. Result of applying ``func`` along the given axis of the
  8895. DataFrame.
  8896. See Also
  8897. --------
  8898. DataFrame.map: For elementwise operations.
  8899. DataFrame.aggregate: Only perform aggregating type operations.
  8900. DataFrame.transform: Only perform transforming type operations.
  8901. Notes
  8902. -----
  8903. Functions that mutate the passed object can produce unexpected
  8904. behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
  8905. for more details.
  8906. Examples
  8907. --------
  8908. >>> df = pd.DataFrame([[4, 9]] * 3, columns=['A', 'B'])
  8909. >>> df
  8910. A B
  8911. 0 4 9
  8912. 1 4 9
  8913. 2 4 9
  8914. Using a numpy universal function (in this case the same as
  8915. ``np.sqrt(df)``):
  8916. >>> df.apply(np.sqrt)
  8917. A B
  8918. 0 2.0 3.0
  8919. 1 2.0 3.0
  8920. 2 2.0 3.0
  8921. Using a reducing function on either axis
  8922. >>> df.apply(np.sum, axis=0)
  8923. A 12
  8924. B 27
  8925. dtype: int64
  8926. >>> df.apply(np.sum, axis=1)
  8927. 0 13
  8928. 1 13
  8929. 2 13
  8930. dtype: int64
  8931. Returning a list-like will result in a Series
  8932. >>> df.apply(lambda x: [1, 2], axis=1)
  8933. 0 [1, 2]
  8934. 1 [1, 2]
  8935. 2 [1, 2]
  8936. dtype: object
  8937. Passing ``result_type='expand'`` will expand list-like results
  8938. to columns of a Dataframe
  8939. >>> df.apply(lambda x: [1, 2], axis=1, result_type='expand')
  8940. 0 1
  8941. 0 1 2
  8942. 1 1 2
  8943. 2 1 2
  8944. Returning a Series inside the function is similar to passing
  8945. ``result_type='expand'``. The resulting column names
  8946. will be the Series index.
  8947. >>> df.apply(lambda x: pd.Series([1, 2], index=['foo', 'bar']), axis=1)
  8948. foo bar
  8949. 0 1 2
  8950. 1 1 2
  8951. 2 1 2
  8952. Passing ``result_type='broadcast'`` will ensure the same shape
  8953. result, whether list-like or scalar is returned by the function,
  8954. and broadcast it along the axis. The resulting column names will
  8955. be the originals.
  8956. >>> df.apply(lambda x: [1, 2], axis=1, result_type='broadcast')
  8957. A B
  8958. 0 1 2
  8959. 1 1 2
  8960. 2 1 2
  8961. """
  8962. from pandas.core.apply import frame_apply
  8963. op = frame_apply(
  8964. self,
  8965. func=func,
  8966. axis=axis,
  8967. raw=raw,
  8968. result_type=result_type,
  8969. by_row=by_row,
  8970. engine=engine,
  8971. engine_kwargs=engine_kwargs,
  8972. args=args,
  8973. kwargs=kwargs,
  8974. )
  8975. return op.apply().__finalize__(self, method="apply")
  8976. def map(
  8977. self, func: PythonFuncType, na_action: str | None = None, **kwargs
  8978. ) -> DataFrame:
  8979. """
  8980. Apply a function to a Dataframe elementwise.
  8981. .. versionadded:: 2.1.0
  8982. DataFrame.applymap was deprecated and renamed to DataFrame.map.
  8983. This method applies a function that accepts and returns a scalar
  8984. to every element of a DataFrame.
  8985. Parameters
  8986. ----------
  8987. func : callable
  8988. Python function, returns a single value from a single value.
  8989. na_action : {None, 'ignore'}, default None
  8990. If 'ignore', propagate NaN values, without passing them to func.
  8991. **kwargs
  8992. Additional keyword arguments to pass as keywords arguments to
  8993. `func`.
  8994. Returns
  8995. -------
  8996. DataFrame
  8997. Transformed DataFrame.
  8998. See Also
  8999. --------
  9000. DataFrame.apply : Apply a function along input axis of DataFrame.
  9001. DataFrame.replace: Replace values given in `to_replace` with `value`.
  9002. Series.map : Apply a function elementwise on a Series.
  9003. Examples
  9004. --------
  9005. >>> df = pd.DataFrame([[1, 2.12], [3.356, 4.567]])
  9006. >>> df
  9007. 0 1
  9008. 0 1.000 2.120
  9009. 1 3.356 4.567
  9010. >>> df.map(lambda x: len(str(x)))
  9011. 0 1
  9012. 0 3 4
  9013. 1 5 5
  9014. Like Series.map, NA values can be ignored:
  9015. >>> df_copy = df.copy()
  9016. >>> df_copy.iloc[0, 0] = pd.NA
  9017. >>> df_copy.map(lambda x: len(str(x)), na_action='ignore')
  9018. 0 1
  9019. 0 NaN 4
  9020. 1 5.0 5
  9021. It is also possible to use `map` with functions that are not
  9022. `lambda` functions:
  9023. >>> df.map(round, ndigits=1)
  9024. 0 1
  9025. 0 1.0 2.1
  9026. 1 3.4 4.6
  9027. Note that a vectorized version of `func` often exists, which will
  9028. be much faster. You could square each number elementwise.
  9029. >>> df.map(lambda x: x**2)
  9030. 0 1
  9031. 0 1.000000 4.494400
  9032. 1 11.262736 20.857489
  9033. But it's better to avoid map in that case.
  9034. >>> df ** 2
  9035. 0 1
  9036. 0 1.000000 4.494400
  9037. 1 11.262736 20.857489
  9038. """
  9039. if na_action not in {"ignore", None}:
  9040. raise ValueError(
  9041. f"na_action must be 'ignore' or None. Got {repr(na_action)}"
  9042. )
  9043. if self.empty:
  9044. return self.copy()
  9045. func = functools.partial(func, **kwargs)
  9046. def infer(x):
  9047. return x._map_values(func, na_action=na_action)
  9048. return self.apply(infer).__finalize__(self, "map")
  9049. def applymap(
  9050. self, func: PythonFuncType, na_action: NaAction | None = None, **kwargs
  9051. ) -> DataFrame:
  9052. """
  9053. Apply a function to a Dataframe elementwise.
  9054. .. deprecated:: 2.1.0
  9055. DataFrame.applymap has been deprecated. Use DataFrame.map instead.
  9056. This method applies a function that accepts and returns a scalar
  9057. to every element of a DataFrame.
  9058. Parameters
  9059. ----------
  9060. func : callable
  9061. Python function, returns a single value from a single value.
  9062. na_action : {None, 'ignore'}, default None
  9063. If 'ignore', propagate NaN values, without passing them to func.
  9064. **kwargs
  9065. Additional keyword arguments to pass as keywords arguments to
  9066. `func`.
  9067. Returns
  9068. -------
  9069. DataFrame
  9070. Transformed DataFrame.
  9071. See Also
  9072. --------
  9073. DataFrame.apply : Apply a function along input axis of DataFrame.
  9074. DataFrame.map : Apply a function along input axis of DataFrame.
  9075. DataFrame.replace: Replace values given in `to_replace` with `value`.
  9076. Examples
  9077. --------
  9078. >>> df = pd.DataFrame([[1, 2.12], [3.356, 4.567]])
  9079. >>> df
  9080. 0 1
  9081. 0 1.000 2.120
  9082. 1 3.356 4.567
  9083. >>> df.map(lambda x: len(str(x)))
  9084. 0 1
  9085. 0 3 4
  9086. 1 5 5
  9087. """
  9088. warnings.warn(
  9089. "DataFrame.applymap has been deprecated. Use DataFrame.map instead.",
  9090. FutureWarning,
  9091. stacklevel=find_stack_level(),
  9092. )
  9093. return self.map(func, na_action=na_action, **kwargs)
  9094. # ----------------------------------------------------------------------
  9095. # Merging / joining methods
  9096. def _append(
  9097. self,
  9098. other,
  9099. ignore_index: bool = False,
  9100. verify_integrity: bool = False,
  9101. sort: bool = False,
  9102. ) -> DataFrame:
  9103. if isinstance(other, (Series, dict)):
  9104. if isinstance(other, dict):
  9105. if not ignore_index:
  9106. raise TypeError("Can only append a dict if ignore_index=True")
  9107. other = Series(other)
  9108. if other.name is None and not ignore_index:
  9109. raise TypeError(
  9110. "Can only append a Series if ignore_index=True "
  9111. "or if the Series has a name"
  9112. )
  9113. index = Index(
  9114. [other.name],
  9115. name=self.index.names
  9116. if isinstance(self.index, MultiIndex)
  9117. else self.index.name,
  9118. )
  9119. row_df = other.to_frame().T
  9120. # infer_objects is needed for
  9121. # test_append_empty_frame_to_series_with_dateutil_tz
  9122. other = row_df.infer_objects(copy=False).rename_axis(
  9123. index.names, copy=False
  9124. )
  9125. elif isinstance(other, list):
  9126. if not other:
  9127. pass
  9128. elif not isinstance(other[0], DataFrame):
  9129. other = DataFrame(other)
  9130. if self.index.name is not None and not ignore_index:
  9131. other.index.name = self.index.name
  9132. from pandas.core.reshape.concat import concat
  9133. if isinstance(other, (list, tuple)):
  9134. to_concat = [self, *other]
  9135. else:
  9136. to_concat = [self, other]
  9137. result = concat(
  9138. to_concat,
  9139. ignore_index=ignore_index,
  9140. verify_integrity=verify_integrity,
  9141. sort=sort,
  9142. )
  9143. return result.__finalize__(self, method="append")
  9144. def join(
  9145. self,
  9146. other: DataFrame | Series | Iterable[DataFrame | Series],
  9147. on: IndexLabel | None = None,
  9148. how: MergeHow = "left",
  9149. lsuffix: str = "",
  9150. rsuffix: str = "",
  9151. sort: bool = False,
  9152. validate: JoinValidate | None = None,
  9153. ) -> DataFrame:
  9154. """
  9155. Join columns of another DataFrame.
  9156. Join columns with `other` DataFrame either on index or on a key
  9157. column. Efficiently join multiple DataFrame objects by index at once by
  9158. passing a list.
  9159. Parameters
  9160. ----------
  9161. other : DataFrame, Series, or a list containing any combination of them
  9162. Index should be similar to one of the columns in this one. If a
  9163. Series is passed, its name attribute must be set, and that will be
  9164. used as the column name in the resulting joined DataFrame.
  9165. on : str, list of str, or array-like, optional
  9166. Column or index level name(s) in the caller to join on the index
  9167. in `other`, otherwise joins index-on-index. If multiple
  9168. values given, the `other` DataFrame must have a MultiIndex. Can
  9169. pass an array as the join key if it is not already contained in
  9170. the calling DataFrame. Like an Excel VLOOKUP operation.
  9171. how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'left'
  9172. How to handle the operation of the two objects.
  9173. * left: use calling frame's index (or column if on is specified)
  9174. * right: use `other`'s index.
  9175. * outer: form union of calling frame's index (or column if on is
  9176. specified) with `other`'s index, and sort it lexicographically.
  9177. * inner: form intersection of calling frame's index (or column if
  9178. on is specified) with `other`'s index, preserving the order
  9179. of the calling's one.
  9180. * cross: creates the cartesian product from both frames, preserves the order
  9181. of the left keys.
  9182. lsuffix : str, default ''
  9183. Suffix to use from left frame's overlapping columns.
  9184. rsuffix : str, default ''
  9185. Suffix to use from right frame's overlapping columns.
  9186. sort : bool, default False
  9187. Order result DataFrame lexicographically by the join key. If False,
  9188. the order of the join key depends on the join type (how keyword).
  9189. validate : str, optional
  9190. If specified, checks if join is of specified type.
  9191. * "one_to_one" or "1:1": check if join keys are unique in both left
  9192. and right datasets.
  9193. * "one_to_many" or "1:m": check if join keys are unique in left dataset.
  9194. * "many_to_one" or "m:1": check if join keys are unique in right dataset.
  9195. * "many_to_many" or "m:m": allowed, but does not result in checks.
  9196. .. versionadded:: 1.5.0
  9197. Returns
  9198. -------
  9199. DataFrame
  9200. A dataframe containing columns from both the caller and `other`.
  9201. See Also
  9202. --------
  9203. DataFrame.merge : For column(s)-on-column(s) operations.
  9204. Notes
  9205. -----
  9206. Parameters `on`, `lsuffix`, and `rsuffix` are not supported when
  9207. passing a list of `DataFrame` objects.
  9208. Examples
  9209. --------
  9210. >>> df = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'],
  9211. ... 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']})
  9212. >>> df
  9213. key A
  9214. 0 K0 A0
  9215. 1 K1 A1
  9216. 2 K2 A2
  9217. 3 K3 A3
  9218. 4 K4 A4
  9219. 5 K5 A5
  9220. >>> other = pd.DataFrame({'key': ['K0', 'K1', 'K2'],
  9221. ... 'B': ['B0', 'B1', 'B2']})
  9222. >>> other
  9223. key B
  9224. 0 K0 B0
  9225. 1 K1 B1
  9226. 2 K2 B2
  9227. Join DataFrames using their indexes.
  9228. >>> df.join(other, lsuffix='_caller', rsuffix='_other')
  9229. key_caller A key_other B
  9230. 0 K0 A0 K0 B0
  9231. 1 K1 A1 K1 B1
  9232. 2 K2 A2 K2 B2
  9233. 3 K3 A3 NaN NaN
  9234. 4 K4 A4 NaN NaN
  9235. 5 K5 A5 NaN NaN
  9236. If we want to join using the key columns, we need to set key to be
  9237. the index in both `df` and `other`. The joined DataFrame will have
  9238. key as its index.
  9239. >>> df.set_index('key').join(other.set_index('key'))
  9240. A B
  9241. key
  9242. K0 A0 B0
  9243. K1 A1 B1
  9244. K2 A2 B2
  9245. K3 A3 NaN
  9246. K4 A4 NaN
  9247. K5 A5 NaN
  9248. Another option to join using the key columns is to use the `on`
  9249. parameter. DataFrame.join always uses `other`'s index but we can use
  9250. any column in `df`. This method preserves the original DataFrame's
  9251. index in the result.
  9252. >>> df.join(other.set_index('key'), on='key')
  9253. key A B
  9254. 0 K0 A0 B0
  9255. 1 K1 A1 B1
  9256. 2 K2 A2 B2
  9257. 3 K3 A3 NaN
  9258. 4 K4 A4 NaN
  9259. 5 K5 A5 NaN
  9260. Using non-unique key values shows how they are matched.
  9261. >>> df = pd.DataFrame({'key': ['K0', 'K1', 'K1', 'K3', 'K0', 'K1'],
  9262. ... 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']})
  9263. >>> df
  9264. key A
  9265. 0 K0 A0
  9266. 1 K1 A1
  9267. 2 K1 A2
  9268. 3 K3 A3
  9269. 4 K0 A4
  9270. 5 K1 A5
  9271. >>> df.join(other.set_index('key'), on='key', validate='m:1')
  9272. key A B
  9273. 0 K0 A0 B0
  9274. 1 K1 A1 B1
  9275. 2 K1 A2 B1
  9276. 3 K3 A3 NaN
  9277. 4 K0 A4 B0
  9278. 5 K1 A5 B1
  9279. """
  9280. from pandas.core.reshape.concat import concat
  9281. from pandas.core.reshape.merge import merge
  9282. if isinstance(other, Series):
  9283. if other.name is None:
  9284. raise ValueError("Other Series must have a name")
  9285. other = DataFrame({other.name: other})
  9286. if isinstance(other, DataFrame):
  9287. if how == "cross":
  9288. return merge(
  9289. self,
  9290. other,
  9291. how=how,
  9292. on=on,
  9293. suffixes=(lsuffix, rsuffix),
  9294. sort=sort,
  9295. validate=validate,
  9296. )
  9297. return merge(
  9298. self,
  9299. other,
  9300. left_on=on,
  9301. how=how,
  9302. left_index=on is None,
  9303. right_index=True,
  9304. suffixes=(lsuffix, rsuffix),
  9305. sort=sort,
  9306. validate=validate,
  9307. )
  9308. else:
  9309. if on is not None:
  9310. raise ValueError(
  9311. "Joining multiple DataFrames only supported for joining on index"
  9312. )
  9313. if rsuffix or lsuffix:
  9314. raise ValueError(
  9315. "Suffixes not supported when joining multiple DataFrames"
  9316. )
  9317. # Mypy thinks the RHS is a
  9318. # "Union[DataFrame, Series, Iterable[Union[DataFrame, Series]]]" whereas
  9319. # the LHS is an "Iterable[DataFrame]", but in reality both types are
  9320. # "Iterable[Union[DataFrame, Series]]" due to the if statements
  9321. frames = [cast("DataFrame | Series", self)] + list(other)
  9322. can_concat = all(df.index.is_unique for df in frames)
  9323. # join indexes only using concat
  9324. if can_concat:
  9325. if how == "left":
  9326. res = concat(
  9327. frames, axis=1, join="outer", verify_integrity=True, sort=sort
  9328. )
  9329. return res.reindex(self.index, copy=False)
  9330. else:
  9331. return concat(
  9332. frames, axis=1, join=how, verify_integrity=True, sort=sort
  9333. )
  9334. joined = frames[0]
  9335. for frame in frames[1:]:
  9336. joined = merge(
  9337. joined,
  9338. frame,
  9339. how=how,
  9340. left_index=True,
  9341. right_index=True,
  9342. validate=validate,
  9343. )
  9344. return joined
  9345. @Substitution("")
  9346. @Appender(_merge_doc, indents=2)
  9347. def merge(
  9348. self,
  9349. right: DataFrame | Series,
  9350. how: MergeHow = "inner",
  9351. on: IndexLabel | AnyArrayLike | None = None,
  9352. left_on: IndexLabel | AnyArrayLike | None = None,
  9353. right_on: IndexLabel | AnyArrayLike | None = None,
  9354. left_index: bool = False,
  9355. right_index: bool = False,
  9356. sort: bool = False,
  9357. suffixes: Suffixes = ("_x", "_y"),
  9358. copy: bool | None = None,
  9359. indicator: str | bool = False,
  9360. validate: MergeValidate | None = None,
  9361. ) -> DataFrame:
  9362. from pandas.core.reshape.merge import merge
  9363. return merge(
  9364. self,
  9365. right,
  9366. how=how,
  9367. on=on,
  9368. left_on=left_on,
  9369. right_on=right_on,
  9370. left_index=left_index,
  9371. right_index=right_index,
  9372. sort=sort,
  9373. suffixes=suffixes,
  9374. copy=copy,
  9375. indicator=indicator,
  9376. validate=validate,
  9377. )
  9378. def round(
  9379. self, decimals: int | dict[IndexLabel, int] | Series = 0, *args, **kwargs
  9380. ) -> DataFrame:
  9381. """
  9382. Round a DataFrame to a variable number of decimal places.
  9383. Parameters
  9384. ----------
  9385. decimals : int, dict, Series
  9386. Number of decimal places to round each column to. If an int is
  9387. given, round each column to the same number of places.
  9388. Otherwise dict and Series round to variable numbers of places.
  9389. Column names should be in the keys if `decimals` is a
  9390. dict-like, or in the index if `decimals` is a Series. Any
  9391. columns not included in `decimals` will be left as is. Elements
  9392. of `decimals` which are not columns of the input will be
  9393. ignored.
  9394. *args
  9395. Additional keywords have no effect but might be accepted for
  9396. compatibility with numpy.
  9397. **kwargs
  9398. Additional keywords have no effect but might be accepted for
  9399. compatibility with numpy.
  9400. Returns
  9401. -------
  9402. DataFrame
  9403. A DataFrame with the affected columns rounded to the specified
  9404. number of decimal places.
  9405. See Also
  9406. --------
  9407. numpy.around : Round a numpy array to the given number of decimals.
  9408. Series.round : Round a Series to the given number of decimals.
  9409. Examples
  9410. --------
  9411. >>> df = pd.DataFrame([(.21, .32), (.01, .67), (.66, .03), (.21, .18)],
  9412. ... columns=['dogs', 'cats'])
  9413. >>> df
  9414. dogs cats
  9415. 0 0.21 0.32
  9416. 1 0.01 0.67
  9417. 2 0.66 0.03
  9418. 3 0.21 0.18
  9419. By providing an integer each column is rounded to the same number
  9420. of decimal places
  9421. >>> df.round(1)
  9422. dogs cats
  9423. 0 0.2 0.3
  9424. 1 0.0 0.7
  9425. 2 0.7 0.0
  9426. 3 0.2 0.2
  9427. With a dict, the number of places for specific columns can be
  9428. specified with the column names as key and the number of decimal
  9429. places as value
  9430. >>> df.round({'dogs': 1, 'cats': 0})
  9431. dogs cats
  9432. 0 0.2 0.0
  9433. 1 0.0 1.0
  9434. 2 0.7 0.0
  9435. 3 0.2 0.0
  9436. Using a Series, the number of places for specific columns can be
  9437. specified with the column names as index and the number of
  9438. decimal places as value
  9439. >>> decimals = pd.Series([0, 1], index=['cats', 'dogs'])
  9440. >>> df.round(decimals)
  9441. dogs cats
  9442. 0 0.2 0.0
  9443. 1 0.0 1.0
  9444. 2 0.7 0.0
  9445. 3 0.2 0.0
  9446. """
  9447. from pandas.core.reshape.concat import concat
  9448. def _dict_round(df: DataFrame, decimals):
  9449. for col, vals in df.items():
  9450. try:
  9451. yield _series_round(vals, decimals[col])
  9452. except KeyError:
  9453. yield vals
  9454. def _series_round(ser: Series, decimals: int) -> Series:
  9455. if is_integer_dtype(ser.dtype) or is_float_dtype(ser.dtype):
  9456. return ser.round(decimals)
  9457. return ser
  9458. nv.validate_round(args, kwargs)
  9459. if isinstance(decimals, (dict, Series)):
  9460. if isinstance(decimals, Series) and not decimals.index.is_unique:
  9461. raise ValueError("Index of decimals must be unique")
  9462. if is_dict_like(decimals) and not all(
  9463. is_integer(value) for _, value in decimals.items()
  9464. ):
  9465. raise TypeError("Values in decimals must be integers")
  9466. new_cols = list(_dict_round(self, decimals))
  9467. elif is_integer(decimals):
  9468. # Dispatch to Block.round
  9469. # Argument "decimals" to "round" of "BaseBlockManager" has incompatible
  9470. # type "Union[int, integer[Any]]"; expected "int"
  9471. new_mgr = self._mgr.round(
  9472. decimals=decimals, # type: ignore[arg-type]
  9473. using_cow=using_copy_on_write(),
  9474. )
  9475. return self._constructor_from_mgr(new_mgr, axes=new_mgr.axes).__finalize__(
  9476. self, method="round"
  9477. )
  9478. else:
  9479. raise TypeError("decimals must be an integer, a dict-like or a Series")
  9480. if new_cols is not None and len(new_cols) > 0:
  9481. return self._constructor(
  9482. concat(new_cols, axis=1), index=self.index, columns=self.columns
  9483. ).__finalize__(self, method="round")
  9484. else:
  9485. return self.copy(deep=False)
  9486. # ----------------------------------------------------------------------
  9487. # Statistical methods, etc.
  9488. def corr(
  9489. self,
  9490. method: CorrelationMethod = "pearson",
  9491. min_periods: int = 1,
  9492. numeric_only: bool = False,
  9493. ) -> DataFrame:
  9494. """
  9495. Compute pairwise correlation of columns, excluding NA/null values.
  9496. Parameters
  9497. ----------
  9498. method : {'pearson', 'kendall', 'spearman'} or callable
  9499. Method of correlation:
  9500. * pearson : standard correlation coefficient
  9501. * kendall : Kendall Tau correlation coefficient
  9502. * spearman : Spearman rank correlation
  9503. * callable: callable with input two 1d ndarrays
  9504. and returning a float. Note that the returned matrix from corr
  9505. will have 1 along the diagonals and will be symmetric
  9506. regardless of the callable's behavior.
  9507. min_periods : int, optional
  9508. Minimum number of observations required per pair of columns
  9509. to have a valid result. Currently only available for Pearson
  9510. and Spearman correlation.
  9511. numeric_only : bool, default False
  9512. Include only `float`, `int` or `boolean` data.
  9513. .. versionadded:: 1.5.0
  9514. .. versionchanged:: 2.0.0
  9515. The default value of ``numeric_only`` is now ``False``.
  9516. Returns
  9517. -------
  9518. DataFrame
  9519. Correlation matrix.
  9520. See Also
  9521. --------
  9522. DataFrame.corrwith : Compute pairwise correlation with another
  9523. DataFrame or Series.
  9524. Series.corr : Compute the correlation between two Series.
  9525. Notes
  9526. -----
  9527. Pearson, Kendall and Spearman correlation are currently computed using pairwise complete observations.
  9528. * `Pearson correlation coefficient <https://en.wikipedia.org/wiki/Pearson_correlation_coefficient>`_
  9529. * `Kendall rank correlation coefficient <https://en.wikipedia.org/wiki/Kendall_rank_correlation_coefficient>`_
  9530. * `Spearman's rank correlation coefficient <https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient>`_
  9531. Examples
  9532. --------
  9533. >>> def histogram_intersection(a, b):
  9534. ... v = np.minimum(a, b).sum().round(decimals=1)
  9535. ... return v
  9536. >>> df = pd.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)],
  9537. ... columns=['dogs', 'cats'])
  9538. >>> df.corr(method=histogram_intersection)
  9539. dogs cats
  9540. dogs 1.0 0.3
  9541. cats 0.3 1.0
  9542. >>> df = pd.DataFrame([(1, 1), (2, np.nan), (np.nan, 3), (4, 4)],
  9543. ... columns=['dogs', 'cats'])
  9544. >>> df.corr(min_periods=3)
  9545. dogs cats
  9546. dogs 1.0 NaN
  9547. cats NaN 1.0
  9548. """ # noqa: E501
  9549. data = self._get_numeric_data() if numeric_only else self
  9550. cols = data.columns
  9551. idx = cols.copy()
  9552. mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False)
  9553. if method == "pearson":
  9554. correl = libalgos.nancorr(mat, minp=min_periods)
  9555. elif method == "spearman":
  9556. correl = libalgos.nancorr_spearman(mat, minp=min_periods)
  9557. elif method == "kendall" or callable(method):
  9558. if min_periods is None:
  9559. min_periods = 1
  9560. mat = mat.T
  9561. corrf = nanops.get_corr_func(method)
  9562. K = len(cols)
  9563. correl = np.empty((K, K), dtype=float)
  9564. mask = np.isfinite(mat)
  9565. for i, ac in enumerate(mat):
  9566. for j, bc in enumerate(mat):
  9567. if i > j:
  9568. continue
  9569. valid = mask[i] & mask[j]
  9570. if valid.sum() < min_periods:
  9571. c = np.nan
  9572. elif i == j:
  9573. c = 1.0
  9574. elif not valid.all():
  9575. c = corrf(ac[valid], bc[valid])
  9576. else:
  9577. c = corrf(ac, bc)
  9578. correl[i, j] = c
  9579. correl[j, i] = c
  9580. else:
  9581. raise ValueError(
  9582. "method must be either 'pearson', "
  9583. "'spearman', 'kendall', or a callable, "
  9584. f"'{method}' was supplied"
  9585. )
  9586. result = self._constructor(correl, index=idx, columns=cols, copy=False)
  9587. return result.__finalize__(self, method="corr")
  def cov(
      self,
      min_periods: int | None = None,
      ddof: int | None = 1,
      numeric_only: bool = False,
  ) -> DataFrame:
      """
      Compute pairwise covariance of columns, excluding NA/null values.

      The result is the `covariance matrix
      <https://en.wikipedia.org/wiki/Covariance_matrix>`__ of the columns
      of the DataFrame: entry ``(i, j)`` is the covariance between column
      ``i`` and column ``j``.

      Both NA and null values are automatically excluded from the
      calculation. (See the note below about bias from missing values.)
      A threshold can be set for the minimum number of
      observations for each value created. Comparisons with observations
      below this threshold will be returned as ``NaN``.

      This method is generally used for the analysis of time series data to
      understand the relationship between different measures
      across time.

      Parameters
      ----------
      min_periods : int, optional
          Minimum number of observations required per pair of columns
          to have a valid result.

      ddof : int, default 1
          Delta degrees of freedom.  The divisor used in calculations
          is ``N - ddof``, where ``N`` represents the number of elements.
          This argument is applicable only when no ``nan`` is in the dataframe.

      numeric_only : bool, default False
          Include only `float`, `int` or `boolean` data.

          .. versionadded:: 1.5.0

          .. versionchanged:: 2.0.0
              The default value of ``numeric_only`` is now ``False``.

      Returns
      -------
      DataFrame
          The covariance matrix of the series of the DataFrame.

      See Also
      --------
      Series.cov : Compute covariance with another Series.
      core.window.ewm.ExponentialMovingWindow.cov : Exponential weighted sample
          covariance.
      core.window.expanding.Expanding.cov : Expanding sample covariance.
      core.window.rolling.Rolling.cov : Rolling sample covariance.

      Notes
      -----
      Returns the covariance matrix of the DataFrame's time series.
      The covariance is normalized by N-ddof.

      For DataFrames that have Series that are missing data (assuming that
      data is `missing at random
      <https://en.wikipedia.org/wiki/Missing_data#Missing_at_random>`__)
      the returned covariance matrix will be an unbiased estimate
      of the variance and covariance between the member Series.

      However, for many applications this estimate may not be acceptable
      because the estimate covariance matrix is not guaranteed to be positive
      semi-definite. This could lead to estimate correlations having
      absolute values which are greater than one, and/or a non-invertible
      covariance matrix. See `Estimation of covariance matrices
      <https://en.wikipedia.org/w/index.php?title=Estimation_of_covariance_
      matrices>`__ for more details.

      Examples
      --------
      >>> df = pd.DataFrame([(1, 2), (0, 3), (2, 0), (1, 1)],
      ...                   columns=['dogs', 'cats'])
      >>> df.cov()
                dogs      cats
      dogs  0.666667 -1.000000
      cats -1.000000  1.666667

      >>> np.random.seed(42)
      >>> df = pd.DataFrame(np.random.randn(1000, 5),
      ...                   columns=['a', 'b', 'c', 'd', 'e'])
      >>> df.cov()
                a         b         c         d         e
      a  0.998438 -0.020161  0.059277 -0.008943  0.014144
      b -0.020161  1.059352 -0.008543 -0.024738  0.009826
      c  0.059277 -0.008543  1.010670 -0.001486 -0.000271
      d -0.008943 -0.024738 -0.001486  0.921297 -0.013692
      e  0.014144  0.009826 -0.000271 -0.013692  0.977795

      **Minimum number of periods**

      This method also supports an optional ``min_periods`` keyword
      that specifies the required minimum number of non-NA observations for
      each column pair in order to have a valid result:

      >>> np.random.seed(42)
      >>> df = pd.DataFrame(np.random.randn(20, 3),
      ...                   columns=['a', 'b', 'c'])
      >>> df.loc[df.index[:5], 'a'] = np.nan
      >>> df.loc[df.index[5:10], 'b'] = np.nan
      >>> df.cov(min_periods=12)
                a         b         c
      a  0.316741       NaN -0.150812
      b       NaN  1.248003  0.191417
      c -0.150812  0.191417  0.895202
      """
      numeric = self._get_numeric_data() if numeric_only else self
      labels = numeric.columns
      row_labels = labels.copy()
      values = numeric.to_numpy(dtype=float, na_value=np.nan, copy=False)

      if not notna(values).all():
          # At least one missing value: hand off to the NA-aware routine,
          # which takes ``min_periods`` but is not given ``ddof``.
          base_cov = libalgos.nancorr(values, cov=True, minp=min_periods)
      else:
          if min_periods is not None and min_periods > len(values):
              # Not enough rows to satisfy ``min_periods``: every entry is NaN.
              n_cols = values.shape[1]
              base_cov = np.full((n_cols, n_cols), np.nan)
          else:
              base_cov = np.cov(values.T, ddof=ddof)
          # Force a square 2D layout matching the column labels.
          base_cov = base_cov.reshape((len(labels), len(labels)))

      covariance = self._constructor(
          base_cov, index=row_labels, columns=labels, copy=False
      )
      return covariance.__finalize__(self, method="cov")
  def corrwith(
      self,
      other: DataFrame | Series,
      axis: Axis = 0,
      drop: bool = False,
      method: CorrelationMethod = "pearson",
      numeric_only: bool = False,
  ) -> Series:
      """
      Compute pairwise correlation.

      Pairwise correlation is computed between rows or columns of
      DataFrame with rows or columns of Series or DataFrame. DataFrames
      are first aligned along both axes before computing the
      correlations.

      Parameters
      ----------
      other : DataFrame, Series
          Object with which to compute correlations.
      axis : {0 or 'index', 1 or 'columns'}, default 0
          The axis to use. 0 or 'index' to compute row-wise, 1 or 'columns' for
          column-wise.
      drop : bool, default False
          Drop missing indices from result.
      method : {'pearson', 'kendall', 'spearman'} or callable
          Method of correlation:

          * pearson : standard correlation coefficient
          * kendall : Kendall Tau correlation coefficient
          * spearman : Spearman rank correlation
          * callable: callable with input two 1d ndarrays
              and returning a float.

      numeric_only : bool, default False
          Include only `float`, `int` or `boolean` data.

          .. versionadded:: 1.5.0

          .. versionchanged:: 2.0.0
              The default value of ``numeric_only`` is now ``False``.

      Returns
      -------
      Series
          Pairwise correlations.

      See Also
      --------
      DataFrame.corr : Compute pairwise correlation of columns.

      Examples
      --------
      >>> index = ["a", "b", "c", "d", "e"]
      >>> columns = ["one", "two", "three", "four"]
      >>> df1 = pd.DataFrame(np.arange(20).reshape(5, 4), index=index, columns=columns)
      >>> df2 = pd.DataFrame(np.arange(16).reshape(4, 4), index=index[:4], columns=columns)
      >>> df1.corrwith(df2)
      one      1.0
      two      1.0
      three    1.0
      four     1.0
      dtype: float64

      >>> df2.corrwith(df1, axis=1)
      a    1.0
      b    1.0
      c    1.0
      d    1.0
      e    NaN
      dtype: float64
      """  # noqa: E501
      axis = self._get_axis_number(axis)
      this = self._get_numeric_data() if numeric_only else self

      # A Series is correlated against each row/column through ``apply``.
      if isinstance(other, Series):
          return this.apply(lambda x: other.corr(x, method=method), axis=axis)

      if numeric_only:
          other = other._get_numeric_data()
      lhs, rhs = this.align(other, join="inner", copy=False)

      if axis == 1:
          lhs, rhs = lhs.T, rhs.T

      if method == "pearson":
          # Adding ``0 * other side`` propagates each side's missing values
          # into the other, so both operands share the same NA mask.
          lhs = lhs + rhs * 0
          rhs = rhs + lhs * 0

          # Deviations from the mean.
          lhs_centered = lhs - lhs.mean(numeric_only=numeric_only)
          rhs_centered = rhs - rhs.mean(numeric_only=numeric_only)

          numerator = (lhs_centered * rhs_centered).sum()
          lhs_std = lhs.std(numeric_only=numeric_only)
          rhs_std = rhs.std(numeric_only=numeric_only)
          denominator = (lhs.count() - 1) * lhs_std * rhs_std

          correl = numerator / denominator

      elif method in ["kendall", "spearman"] or callable(method):
          # Column-by-column computation on the underlying arrays.
          column_pairs = zip(lhs.values.T, rhs.values.T)
          correl = self._constructor_sliced(
              (
                  nanops.nancorr(left_col, right_col, method=method)
                  for left_col, right_col in column_pairs
              ),
              index=lhs.columns,
              copy=False,
          )

      else:
          raise ValueError(
              f"Invalid method {method} was passed, "
              "valid methods are: 'pearson', 'kendall', "
              "'spearman', or callable"
          )

      if drop:
          return correl

      # Find non-matching labels along the given axis
      # and append missing correlations (GH 22375)
      raxis: AxisInt = 1 if axis == 0 else 0
      all_labels = this._get_axis(raxis).union(other._get_axis(raxis))
      unmatched = all_labels.difference(correl.index)

      if len(unmatched) > 0:
          padding = Series([np.nan] * len(unmatched), index=unmatched)
          correl = correl._append(padding)

      return correl
  9808. # ----------------------------------------------------------------------
  9809. # ndarray-like stats methods
  def count(self, axis: Axis = 0, numeric_only: bool = False):
      """
      Count non-NA cells for each column or row.

      The values `None`, `NaN`, `NaT`, ``pandas.NA`` are considered NA.

      Parameters
      ----------
      axis : {0 or 'index', 1 or 'columns'}, default 0
          If 0 or 'index' counts are generated for each column.
          If 1 or 'columns' counts are generated for each row.
      numeric_only : bool, default False
          Include only `float`, `int` or `boolean` data.

      Returns
      -------
      Series
          For each column/row the number of non-NA/null entries.

      See Also
      --------
      Series.count: Number of non-NA elements in a Series.
      DataFrame.value_counts: Count unique combinations of columns.
      DataFrame.shape: Number of DataFrame rows and columns (including NA
          elements).
      DataFrame.isna: Boolean same-sized DataFrame showing places of NA
          elements.

      Examples
      --------
      Constructing DataFrame from a dictionary:

      >>> df = pd.DataFrame({"Person":
      ...                    ["John", "Myla", "Lewis", "John", "Myla"],
      ...                    "Age": [24., np.nan, 21., 33, 26],
      ...                    "Single": [False, True, True, True, False]})
      >>> df
         Person   Age  Single
      0    John  24.0   False
      1    Myla   NaN    True
      2   Lewis  21.0    True
      3    John  33.0    True
      4    Myla  26.0   False

      Notice the uncounted NA values:

      >>> df.count()
      Person    5
      Age       4
      Single    5
      dtype: int64

      Counts for each **row**:

      >>> df.count(axis='columns')
      0    3
      1    2
      2    3
      3    3
      4    3
      dtype: int64
      """
      axis = self._get_axis_number(axis)
      frame = self._get_numeric_data() if numeric_only else self

      if len(frame._get_axis(axis)) != 0:
          counts = notna(frame).sum(axis=axis)
      else:
          # GH #423: nothing to count along a zero-length axis, so every
          # label on the aggregation axis gets a count of 0.
          counts = self._constructor_sliced(0, index=frame._get_agg_axis(axis))

      as_int = counts.astype("int64", copy=False)
      return as_int.__finalize__(self, method="count")
  def _reduce(
      self,
      op,
      name: str,
      *,
      axis: Axis = 0,
      skipna: bool = True,
      numeric_only: bool = False,
      filter_type=None,
      **kwds,
  ):
      """
      Apply the reduction ``op`` (identified by ``name``) along ``axis``.

      Parameters
      ----------
      op : callable
          Called as ``op(values, axis=axis, skipna=skipna, **kwds)`` on
          values that are not ExtensionArrays.
      name : str
          Name of the reduction; forwarded to ``ExtensionArray._reduce`` and,
          on the ExtensionDtype ``axis=1`` fast path, to ``groupby().agg``.
      axis : {0, 1} or None, default 0
          Axis to reduce over. ``None`` reduces over all values at once.
      skipna : bool, default True
          Forwarded to ``op`` / ``ExtensionArray._reduce``.
      numeric_only : bool, default False
          If True, first restrict to numeric data (or to boolean data when
          ``filter_type == "bool"``).
      filter_type : {None, "bool"}
          ``"bool"`` selects boolean data under ``numeric_only`` and requests
          a ``bool`` result dtype.
      **kwds
          Extra keyword arguments forwarded to the reduction.

      Returns
      -------
      Series or scalar
          The ``axis=None`` branches return whatever the underlying
          reduction returns; the other branches return a Series.
      """
      assert filter_type is None or filter_type == "bool", filter_type
      out_dtype = "bool" if filter_type == "bool" else None

      if axis is not None:
          axis = self._get_axis_number(axis)

      def func(values: np.ndarray):
          # We only use this in the case that operates on self.values
          return op(values, axis=axis, skipna=skipna, **kwds)

      # Cache, per dtype, whether that ExtensionArray's ``_reduce`` accepts
      # a ``keepdims`` parameter, so the signature is inspected only once.
      dtype_has_keepdims: dict[ExtensionDtype, bool] = {}

      def blk_func(values, axis: Axis = 1):
          # Per-array reduction passed to ``df._mgr.reduce`` below.
          if isinstance(values, ExtensionArray):
              if not is_1d_only_ea_dtype(values.dtype) and not isinstance(
                  self._mgr, ArrayManager
              ):
                  return values._reduce(name, axis=1, skipna=skipna, **kwds)
              has_keepdims = dtype_has_keepdims.get(values.dtype)
              if has_keepdims is None:
                  sign = signature(values._reduce)
                  has_keepdims = "keepdims" in sign.parameters
                  dtype_has_keepdims[values.dtype] = has_keepdims
              if has_keepdims:
                  return values._reduce(name, skipna=skipna, keepdims=True, **kwds)
              else:
                  warnings.warn(
                      f"{type(values)}._reduce will require a `keepdims` parameter "
                      "in the future",
                      FutureWarning,
                      stacklevel=find_stack_level(),
                  )
                  # Without ``keepdims`` support, wrap the result in a
                  # length-1 array ourselves.
                  result = values._reduce(name, skipna=skipna, **kwds)
                  return np.array([result])
          else:
              return op(values, axis=axis, skipna=skipna, **kwds)

      def _get_data() -> DataFrame:
          # Select the subset of columns used when ``numeric_only`` is set.
          if filter_type is None:
              data = self._get_numeric_data()
          else:
              # GH#25101, GH#24434
              assert filter_type == "bool"
              data = self._get_bool_data()
          return data

      # Case with EAs see GH#35881
      df = self
      if numeric_only:
          df = _get_data()
      if axis is None:
          # Reduce over all values at once.
          dtype = find_common_type([arr.dtype for arr in df._mgr.arrays])
          if isinstance(dtype, ExtensionDtype):
              # Cast to the common extension dtype, concatenate all columns
              # into one array and let that array reduce itself.
              df = df.astype(dtype, copy=False)
              arr = concat_compat(list(df._iter_column_arrays()))
              return arr._reduce(name, skipna=skipna, keepdims=False, **kwds)
          return func(df.values)
      elif axis == 1:
          if len(df.index) == 0:
              # Taking a transpose would result in no columns, losing the dtype.
              # In the empty case, reducing along axis 0 or 1 gives the same
              # result dtype, so reduce with axis=0 and ignore values
              result = df._reduce(
                  op,
                  name,
                  axis=0,
                  skipna=skipna,
                  numeric_only=False,
                  filter_type=filter_type,
                  **kwds,
              ).iloc[:0]
              result.index = df.index
              return result

          # kurtosis excluded since groupby does not implement it
          if df.shape[1] and name != "kurt":
              dtype = find_common_type([arr.dtype for arr in df._mgr.arrays])
              if isinstance(dtype, ExtensionDtype):
                  # GH 54341: fastpath for EA-backed axis=1 reductions
                  # This flattens the frame into a single 1D array while keeping
                  # track of the row and column indices of the original frame. Once
                  # flattened, grouping by the row indices and aggregating should
                  # be equivalent to transposing the original frame and aggregating
                  # with axis=0.
                  name = {"argmax": "idxmax", "argmin": "idxmin"}.get(name, name)
                  df = df.astype(dtype, copy=False)
                  arr = concat_compat(list(df._iter_column_arrays()))
                  nrows, ncols = df.shape
                  row_index = np.tile(np.arange(nrows), ncols)
                  col_index = np.repeat(np.arange(ncols), nrows)
                  ser = Series(arr, index=col_index, copy=False)
                  # GroupBy will raise a warning with SeriesGroupBy as the object,
                  # likely confusing users
                  with rewrite_warning(
                      target_message=(
                          f"The behavior of SeriesGroupBy.{name} with all-NA values"
                      ),
                      target_category=FutureWarning,
                      new_message=(
                          f"The behavior of {type(self).__name__}.{name} with all-NA "
                          "values, or any-NA and skipna=False, is deprecated. In "
                          "a future version this will raise ValueError"
                      ),
                  ):
                      result = ser.groupby(row_index).agg(name, **kwds)
                  result.index = df.index
                  if not skipna and name not in ("any", "all"):
                      # With skipna=False, rows containing any NA are
                      # overwritten: -1 for idxmax/idxmin, otherwise the
                      # default fill of ``Series.mask``.
                      mask = df.isna().to_numpy(dtype=np.bool_).any(axis=1)
                      other = -1 if name in ("idxmax", "idxmin") else lib.no_default
                      result = result.mask(mask, other)
                  return result

          df = df.T

      # After possibly _get_data and transposing, we are now in the
      # simple case where we can use BlockManager.reduce
      res = df._mgr.reduce(blk_func)
      out = df._constructor_from_mgr(res, axes=res.axes).iloc[0]
      if out_dtype is not None and out.dtype != "boolean":
          out = out.astype(out_dtype)
      elif (df._mgr.get_dtypes() == object).any() and name not in ["any", "all"]:
          out = out.astype(object)
      elif len(self) == 0 and out.dtype == object and name in ("sum", "prod"):
          # Even if we are object dtype, follow numpy and return
          # float64, see test_apply_funcs_over_empty
          out = out.astype(np.float64)

      return out
  10002. def _reduce_axis1(self, name: str, func, skipna: bool) -> Series:
  10003. """
  10004. Special case for _reduce to try to avoid a potentially-expensive transpose.
  10005. Apply the reduction block-wise along axis=1 and then reduce the resulting
  10006. 1D arrays.
  10007. """
  10008. if name == "all":
  10009. result = np.ones(len(self), dtype=bool)
  10010. ufunc = np.logical_and
  10011. elif name == "any":
  10012. result = np.zeros(len(self), dtype=bool)
  10013. # error: Incompatible types in assignment
  10014. # (expression has type "_UFunc_Nin2_Nout1[Literal['logical_or'],
  10015. # Literal[20], Literal[False]]", variable has type
  10016. # "_UFunc_Nin2_Nout1[Literal['logical_and'], Literal[20],
  10017. # Literal[True]]")
  10018. ufunc = np.logical_or # type: ignore[assignment]
  10019. else:
  10020. raise NotImplementedError(name)
  10021. for arr in self._mgr.arrays:
  10022. middle = func(arr, axis=0, skipna=skipna)
  10023. result = ufunc(result, middle)
  10024. res_ser = self._constructor_sliced(result, index=self.index, copy=False)
  10025. return res_ser
  @doc(make_doc("any", ndim=2))
  # error: Signature of "any" incompatible with supertype "NDFrame"
  def any(  # type: ignore[override]
      self,
      *,
      axis: Axis | None = 0,
      bool_only: bool = False,
      skipna: bool = True,
      **kwargs,
  ) -> Series | bool:
      outcome = self._logical_func(
          "any", nanops.nanany, axis, bool_only, skipna, **kwargs
      )
      # Only a Series result gets metadata propagated from ``self``; any
      # other result (e.g. a plain bool) is returned untouched.
      if not isinstance(outcome, Series):
          return outcome
      return outcome.__finalize__(self, method="any")
  @doc(make_doc("all", ndim=2))
  def all(
      self,
      axis: Axis | None = 0,
      bool_only: bool = False,
      skipna: bool = True,
      **kwargs,
  ) -> Series | bool:
      outcome = self._logical_func(
          "all", nanops.nanall, axis, bool_only, skipna, **kwargs
      )
      # Only a Series result gets metadata propagated from ``self``; any
      # other result (e.g. a plain bool) is returned untouched.
      if not isinstance(outcome, Series):
          return outcome
      return outcome.__finalize__(self, method="all")
  @doc(make_doc("min", ndim=2))
  def min(
      self,
      axis: Axis | None = 0,
      skipna: bool = True,
      numeric_only: bool = False,
      **kwargs,
  ):
      reduced = super().min(axis, skipna, numeric_only, **kwargs)
      # Non-Series results are handed back unchanged; a Series result has
      # metadata propagated from ``self``.
      if not isinstance(reduced, Series):
          return reduced
      return reduced.__finalize__(self, method="min")
  @doc(make_doc("max", ndim=2))
  def max(
      self,
      axis: Axis | None = 0,
      skipna: bool = True,
      numeric_only: bool = False,
      **kwargs,
  ):
      reduced = super().max(axis, skipna, numeric_only, **kwargs)
      # Non-Series results are handed back unchanged; a Series result has
      # metadata propagated from ``self``.
      if not isinstance(reduced, Series):
          return reduced
      return reduced.__finalize__(self, method="max")
  @doc(make_doc("sum", ndim=2))
  def sum(
      self,
      axis: Axis | None = 0,
      skipna: bool = True,
      numeric_only: bool = False,
      min_count: int = 0,
      **kwargs,
  ):
      result = super().sum(axis, skipna, numeric_only, min_count, **kwargs)
      # Guard exactly like the sibling reductions (min, max, mean, ...):
      # ``axis`` may be None, and only a Series result has a ``__finalize__``
      # to call. Anything else is returned as-is rather than raising
      # AttributeError.
      if isinstance(result, Series):
          result = result.__finalize__(self, method="sum")
      return result
  @doc(make_doc("prod", ndim=2))
  def prod(
      self,
      axis: Axis | None = 0,
      skipna: bool = True,
      numeric_only: bool = False,
      min_count: int = 0,
      **kwargs,
  ):
      result = super().prod(axis, skipna, numeric_only, min_count, **kwargs)
      # Guard exactly like the sibling reductions (min, max, mean, ...):
      # ``axis`` may be None, and only a Series result has a ``__finalize__``
      # to call. Anything else is returned as-is rather than raising
      # AttributeError.
      if isinstance(result, Series):
          result = result.__finalize__(self, method="prod")
      return result
  @doc(make_doc("mean", ndim=2))
  def mean(
      self,
      axis: Axis | None = 0,
      skipna: bool = True,
      numeric_only: bool = False,
      **kwargs,
  ):
      reduced = super().mean(axis, skipna, numeric_only, **kwargs)
      # Non-Series results are handed back unchanged; a Series result has
      # metadata propagated from ``self``.
      if not isinstance(reduced, Series):
          return reduced
      return reduced.__finalize__(self, method="mean")
  @doc(make_doc("median", ndim=2))
  def median(
      self,
      axis: Axis | None = 0,
      skipna: bool = True,
      numeric_only: bool = False,
      **kwargs,
  ):
      reduced = super().median(axis, skipna, numeric_only, **kwargs)
      # Non-Series results are handed back unchanged; a Series result has
      # metadata propagated from ``self``.
      if not isinstance(reduced, Series):
          return reduced
      return reduced.__finalize__(self, method="median")
  @doc(make_doc("sem", ndim=2))
  def sem(
      self,
      axis: Axis | None = 0,
      skipna: bool = True,
      ddof: int = 1,
      numeric_only: bool = False,
      **kwargs,
  ):
      reduced = super().sem(axis, skipna, ddof, numeric_only, **kwargs)
      # Non-Series results are handed back unchanged; a Series result has
      # metadata propagated from ``self``.
      if not isinstance(reduced, Series):
          return reduced
      return reduced.__finalize__(self, method="sem")
  @doc(make_doc("var", ndim=2))
  def var(
      self,
      axis: Axis | None = 0,
      skipna: bool = True,
      ddof: int = 1,
      numeric_only: bool = False,
      **kwargs,
  ):
      reduced = super().var(axis, skipna, ddof, numeric_only, **kwargs)
      # Non-Series results are handed back unchanged; a Series result has
      # metadata propagated from ``self``.
      if not isinstance(reduced, Series):
          return reduced
      return reduced.__finalize__(self, method="var")
  @doc(make_doc("std", ndim=2))
  def std(
      self,
      axis: Axis | None = 0,
      skipna: bool = True,
      ddof: int = 1,
      numeric_only: bool = False,
      **kwargs,
  ):
      reduced = super().std(axis, skipna, ddof, numeric_only, **kwargs)
      # Non-Series results are handed back unchanged; a Series result has
      # metadata propagated from ``self``.
      if not isinstance(reduced, Series):
          return reduced
      return reduced.__finalize__(self, method="std")
  @doc(make_doc("skew", ndim=2))
  def skew(
      self,
      axis: Axis | None = 0,
      skipna: bool = True,
      numeric_only: bool = False,
      **kwargs,
  ):
      reduced = super().skew(axis, skipna, numeric_only, **kwargs)
      # Non-Series results are handed back unchanged; a Series result has
      # metadata propagated from ``self``.
      if not isinstance(reduced, Series):
          return reduced
      return reduced.__finalize__(self, method="skew")
  @doc(make_doc("kurt", ndim=2))
  def kurt(
      self,
      axis: Axis | None = 0,
      skipna: bool = True,
      numeric_only: bool = False,
      **kwargs,
  ):
      reduced = super().kurt(axis, skipna, numeric_only, **kwargs)
      # Non-Series results are handed back unchanged; a Series result has
      # metadata propagated from ``self``.
      if not isinstance(reduced, Series):
          return reduced
      return reduced.__finalize__(self, method="kurt")
  # Longer-named aliases bound to the very same ``kurt`` and ``prod`` methods.
  kurtosis = kurt
  product = prod
  # Thin wrapper: documentation comes from ``make_doc``; the computation is
  # delegated to ``NDFrame.cummin`` with all arguments passed through.
  @doc(make_doc("cummin", ndim=2))
  def cummin(self, axis: Axis | None = None, skipna: bool = True, *args, **kwargs):
      return NDFrame.cummin(self, axis, skipna, *args, **kwargs)
  # Thin wrapper: documentation comes from ``make_doc``; the computation is
  # delegated to ``NDFrame.cummax`` with all arguments passed through.
  @doc(make_doc("cummax", ndim=2))
  def cummax(self, axis: Axis | None = None, skipna: bool = True, *args, **kwargs):
      return NDFrame.cummax(self, axis, skipna, *args, **kwargs)
  # Thin wrapper: documentation comes from ``make_doc``; the computation is
  # delegated to ``NDFrame.cumsum`` with all arguments passed through.
  @doc(make_doc("cumsum", ndim=2))
  def cumsum(self, axis: Axis | None = None, skipna: bool = True, *args, **kwargs):
      return NDFrame.cumsum(self, axis, skipna, *args, **kwargs)
  # Thin wrapper: documentation comes from ``make_doc``; the computation is
  # delegated to ``NDFrame.cumprod`` with all arguments passed through.
  @doc(make_doc("cumprod", ndim=2))
  def cumprod(self, axis: Axis | None = None, skipna: bool = True, *args, **kwargs):
      cumulative = NDFrame.cumprod(self, axis, skipna, *args, **kwargs)
      return cumulative
  def nunique(self, axis: Axis = 0, dropna: bool = True) -> Series:
      """
      Count number of distinct elements in specified axis.

      Return Series with number of distinct elements. Can ignore NaN
      values.

      Parameters
      ----------
      axis : {0 or 'index', 1 or 'columns'}, default 0
          The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for
          column-wise.
      dropna : bool, default True
          Don't include NaN in the counts.

      Returns
      -------
      Series

      See Also
      --------
      Series.nunique: Method nunique for Series.
      DataFrame.count: Count non-NA cells for each column or row.

      Examples
      --------
      >>> df = pd.DataFrame({'A': [4, 5, 6], 'B': [4, 1, 1]})
      >>> df.nunique()
      A    3
      B    2
      dtype: int64

      >>> df.nunique(axis=1)
      0    1
      1    2
      2    2
      dtype: int64
      """
      # Each column (or row, for axis=1) is counted by ``Series.nunique``.
      distinct_counts = self.apply(Series.nunique, axis=axis, dropna=dropna)
      return distinct_counts
  @doc(_shared_docs["idxmin"], numeric_only_default="False")
  def idxmin(
      self, axis: Axis = 0, skipna: bool = True, numeric_only: bool = False
  ) -> Series:
      axis = self._get_axis_number(axis)

      # Empty frame whose reduced axis still has labels: return an empty
      # result typed after that axis.
      if self.empty and len(self.axes[axis]):
          return self._constructor_sliced(dtype=self.axes[axis].dtype)

      data = self._get_numeric_data() if numeric_only else self

      positions = data._reduce(
          nanops.nanargmin, "argmin", axis=axis, skipna=skipna, numeric_only=False
      )._values
      # positions will always be np.ndarray since axis is not None

      # -1 marks an entry for which no position was found.
      if (positions == -1).any():
          warnings.warn(
              f"The behavior of {type(self).__name__}.idxmin with all-NA "
              "values, or any-NA and skipna=False, is deprecated. In a future "
              "version this will raise ValueError",
              FutureWarning,
              stacklevel=find_stack_level(),
          )

      # Translate positions into labels; -1 is filled with the index's NA value.
      labels = data._get_axis(axis)
      taken = algorithms.take(
          labels._values, positions, allow_fill=True, fill_value=labels._na_value
      )
      located = data._constructor_sliced(taken, index=data._get_agg_axis(axis))
      return located.__finalize__(self, method="idxmin")
  @doc(_shared_docs["idxmax"], numeric_only_default="False")
  def idxmax(
      self, axis: Axis = 0, skipna: bool = True, numeric_only: bool = False
  ) -> Series:
      axis = self._get_axis_number(axis)

      # Empty frame whose reduced axis still has labels: return an empty
      # result typed after that axis.
      if self.empty and len(self.axes[axis]):
          return self._constructor_sliced(dtype=self.axes[axis].dtype)

      data = self._get_numeric_data() if numeric_only else self

      positions = data._reduce(
          nanops.nanargmax, "argmax", axis=axis, skipna=skipna, numeric_only=False
      )._values
      # positions will always be 1d array since axis is not None

      # -1 marks an entry for which no position was found.
      if (positions == -1).any():
          warnings.warn(
              f"The behavior of {type(self).__name__}.idxmax with all-NA "
              "values, or any-NA and skipna=False, is deprecated. In a future "
              "version this will raise ValueError",
              FutureWarning,
              stacklevel=find_stack_level(),
          )

      # Translate positions into labels; -1 is filled with the index's NA value.
      labels = data._get_axis(axis)
      taken = algorithms.take(
          labels._values, positions, allow_fill=True, fill_value=labels._na_value
      )
      located = data._constructor_sliced(taken, index=data._get_agg_axis(axis))
      return located.__finalize__(self, method="idxmax")
  10298. def _get_agg_axis(self, axis_num: int) -> Index:
  10299. """
  10300. Let's be explicit about this.
  10301. """
  10302. if axis_num == 0:
  10303. return self.columns
  10304. elif axis_num == 1:
  10305. return self.index
  10306. else:
  10307. raise ValueError(f"Axis must be 0 or 1 (got {repr(axis_num)})")
  def mode(
      self, axis: Axis = 0, numeric_only: bool = False, dropna: bool = True
  ) -> DataFrame:
      """
      Get the mode(s) of each element along the selected axis.

      The mode of a set of values is the value that appears most often.
      It can be multiple values.

      Parameters
      ----------
      axis : {0 or 'index', 1 or 'columns'}, default 0
          The axis to iterate over while searching for the mode:

          * 0 or 'index' : get mode of each column
          * 1 or 'columns' : get mode of each row.

      numeric_only : bool, default False
          If True, only apply to numeric columns.
      dropna : bool, default True
          Don't consider counts of NaN/NaT.

      Returns
      -------
      DataFrame
          The modes of each column or row.

      See Also
      --------
      Series.mode : Return the highest frequency value in a Series.
      Series.value_counts : Return the counts of values in a Series.

      Examples
      --------
      >>> df = pd.DataFrame([('bird', 2, 2),
      ...                    ('mammal', 4, np.nan),
      ...                    ('arthropod', 8, 0),
      ...                    ('bird', 2, np.nan)],
      ...                   index=('falcon', 'horse', 'spider', 'ostrich'),
      ...                   columns=('species', 'legs', 'wings'))
      >>> df
                 species  legs  wings
      falcon        bird     2    2.0
      horse       mammal     4    NaN
      spider   arthropod     8    0.0
      ostrich       bird     2    NaN

      By default, missing values are not considered, and the mode of wings
      are both 0 and 2. Because the resulting DataFrame has two rows,
      the second row of ``species`` and ``legs`` contains ``NaN``.

      >>> df.mode()
        species  legs  wings
      0    bird   2.0    0.0
      1     NaN   NaN    2.0

      Setting ``dropna=False`` ``NaN`` values are considered and they can be
      the mode (like for wings).

      >>> df.mode(dropna=False)
        species  legs  wings
      0    bird     2    NaN

      Setting ``numeric_only=True``, only the mode of numeric columns is
      computed, and columns of other types are ignored.

      >>> df.mode(numeric_only=True)
         legs  wings
      0   2.0    0.0
      1   NaN    2.0

      To compute the mode over columns and not rows, use the axis parameter:

      >>> df.mode(axis='columns', numeric_only=True)
                 0    1
      falcon   2.0  NaN
      horse    4.0  NaN
      spider   0.0  8.0
      ostrich  2.0  NaN
      """
      source = self._get_numeric_data() if numeric_only else self

      # Each column (or row, for axis=1) contributes its own ``Series.mode``.
      modes = source.apply(lambda values: values.mode(dropna=dropna), axis=axis)

      # Ensure index is type stable (should always use int index)
      if modes.empty:
          modes.index = default_index(0)

      return modes
  # Typing-only overloads for ``quantile``; the implementation follows them.

  # A single float ``q`` (the default form) is typed as returning a Series.
  @overload
  def quantile(
      self,
      q: float = ...,
      axis: Axis = ...,
      numeric_only: bool = ...,
      interpolation: QuantileInterpolation = ...,
      method: Literal["single", "table"] = ...,
  ) -> Series:
      ...

  # An array-like or sequence ``q`` is typed as returning Series | DataFrame.
  @overload
  def quantile(
      self,
      q: AnyArrayLike | Sequence[float],
      axis: Axis = ...,
      numeric_only: bool = ...,
      interpolation: QuantileInterpolation = ...,
      method: Literal["single", "table"] = ...,
  ) -> Series | DataFrame:
      ...

  # Catch-all for a ``q`` that may be either a float or array-like.
  @overload
  def quantile(
      self,
      q: float | AnyArrayLike | Sequence[float] = ...,
      axis: Axis = ...,
      numeric_only: bool = ...,
      interpolation: QuantileInterpolation = ...,
      method: Literal["single", "table"] = ...,
  ) -> Series | DataFrame:
      ...
  10411. def quantile(
  10412. self,
  10413. q: float | AnyArrayLike | Sequence[float] = 0.5,
  10414. axis: Axis = 0,
  10415. numeric_only: bool = False,
  10416. interpolation: QuantileInterpolation = "linear",
  10417. method: Literal["single", "table"] = "single",
  10418. ) -> Series | DataFrame:
  10419. """
  10420. Return values at the given quantile over requested axis.
  10421. Parameters
  10422. ----------
  10423. q : float or array-like, default 0.5 (50% quantile)
  10424. Value between 0 <= q <= 1, the quantile(s) to compute.
  10425. axis : {0 or 'index', 1 or 'columns'}, default 0
  10426. Equals 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
  10427. numeric_only : bool, default False
  10428. Include only `float`, `int` or `boolean` data.
  10429. .. versionchanged:: 2.0.0
  10430. The default value of ``numeric_only`` is now ``False``.
  10431. interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
  10432. This optional parameter specifies the interpolation method to use,
  10433. when the desired quantile lies between two data points `i` and `j`:
  10434. * linear: `i + (j - i) * fraction`, where `fraction` is the
  10435. fractional part of the index surrounded by `i` and `j`.
  10436. * lower: `i`.
  10437. * higher: `j`.
  10438. * nearest: `i` or `j` whichever is nearest.
  10439. * midpoint: (`i` + `j`) / 2.
  10440. method : {'single', 'table'}, default 'single'
  10441. Whether to compute quantiles per-column ('single') or over all columns
  10442. ('table'). When 'table', the only allowed interpolation methods are
  10443. 'nearest', 'lower', and 'higher'.
  10444. Returns
  10445. -------
  10446. Series or DataFrame
  10447. If ``q`` is an array, a DataFrame will be returned where the
  10448. index is ``q``, the columns are the columns of self, and the
  10449. values are the quantiles.
  10450. If ``q`` is a float, a Series will be returned where the
  10451. index is the columns of self and the values are the quantiles.
  10452. See Also
  10453. --------
  10454. core.window.rolling.Rolling.quantile: Rolling quantile.
  10455. numpy.percentile: Numpy function to compute the percentile.
  10456. Examples
  10457. --------
  10458. >>> df = pd.DataFrame(np.array([[1, 1], [2, 10], [3, 100], [4, 100]]),
  10459. ... columns=['a', 'b'])
  10460. >>> df.quantile(.1)
  10461. a 1.3
  10462. b 3.7
  10463. Name: 0.1, dtype: float64
  10464. >>> df.quantile([.1, .5])
  10465. a b
  10466. 0.1 1.3 3.7
  10467. 0.5 2.5 55.0
  10468. Specifying `method='table'` will compute the quantile over all columns.
  10469. >>> df.quantile(.1, method="table", interpolation="nearest")
  10470. a 1
  10471. b 1
  10472. Name: 0.1, dtype: int64
  10473. >>> df.quantile([.1, .5], method="table", interpolation="nearest")
  10474. a b
  10475. 0.1 1 1
  10476. 0.5 3 100
  10477. Specifying `numeric_only=False` will also compute the quantile of
  10478. datetime and timedelta data.
  10479. >>> df = pd.DataFrame({'A': [1, 2],
  10480. ... 'B': [pd.Timestamp('2010'),
  10481. ... pd.Timestamp('2011')],
  10482. ... 'C': [pd.Timedelta('1 days'),
  10483. ... pd.Timedelta('2 days')]})
  10484. >>> df.quantile(0.5, numeric_only=False)
  10485. A 1.5
  10486. B 2010-07-02 12:00:00
  10487. C 1 days 12:00:00
  10488. Name: 0.5, dtype: object
  10489. """
  10490. validate_percentile(q)
  10491. axis = self._get_axis_number(axis)
  10492. if not is_list_like(q):
  10493. # BlockManager.quantile expects listlike, so we wrap and unwrap here
  10494. # error: List item 0 has incompatible type "float | ExtensionArray |
  10495. # ndarray[Any, Any] | Index | Series | Sequence[float]"; expected "float"
  10496. res_df = self.quantile(
  10497. [q], # type: ignore[list-item]
  10498. axis=axis,
  10499. numeric_only=numeric_only,
  10500. interpolation=interpolation,
  10501. method=method,
  10502. )
  10503. if method == "single":
  10504. res = res_df.iloc[0]
  10505. else:
  10506. # cannot directly iloc over sparse arrays
  10507. res = res_df.T.iloc[:, 0]
  10508. if axis == 1 and len(self) == 0:
  10509. # GH#41544 try to get an appropriate dtype
  10510. dtype = find_common_type(list(self.dtypes))
  10511. if needs_i8_conversion(dtype):
  10512. return res.astype(dtype)
  10513. return res
  10514. q = Index(q, dtype=np.float64)
  10515. data = self._get_numeric_data() if numeric_only else self
  10516. if axis == 1:
  10517. data = data.T
  10518. if len(data.columns) == 0:
  10519. # GH#23925 _get_numeric_data may have dropped all columns
  10520. cols = Index([], name=self.columns.name)
  10521. dtype = np.float64
  10522. if axis == 1:
  10523. # GH#41544 try to get an appropriate dtype
  10524. cdtype = find_common_type(list(self.dtypes))
  10525. if needs_i8_conversion(cdtype):
  10526. dtype = cdtype
  10527. res = self._constructor([], index=q, columns=cols, dtype=dtype)
  10528. return res.__finalize__(self, method="quantile")
  10529. valid_method = {"single", "table"}
  10530. if method not in valid_method:
  10531. raise ValueError(
  10532. f"Invalid method: {method}. Method must be in {valid_method}."
  10533. )
  10534. if method == "single":
  10535. res = data._mgr.quantile(qs=q, interpolation=interpolation)
  10536. elif method == "table":
  10537. valid_interpolation = {"nearest", "lower", "higher"}
  10538. if interpolation not in valid_interpolation:
  10539. raise ValueError(
  10540. f"Invalid interpolation: {interpolation}. "
  10541. f"Interpolation must be in {valid_interpolation}"
  10542. )
  10543. # handle degenerate case
  10544. if len(data) == 0:
  10545. if data.ndim == 2:
  10546. dtype = find_common_type(list(self.dtypes))
  10547. else:
  10548. dtype = self.dtype
  10549. return self._constructor([], index=q, columns=data.columns, dtype=dtype)
  10550. q_idx = np.quantile(np.arange(len(data)), q, method=interpolation)
  10551. by = data.columns
  10552. if len(by) > 1:
  10553. keys = [data._get_label_or_level_values(x) for x in by]
  10554. indexer = lexsort_indexer(keys)
  10555. else:
  10556. k = data._get_label_or_level_values(by[0])
  10557. indexer = nargsort(k)
  10558. res = data._mgr.take(indexer[q_idx], verify=False)
  10559. res.axes[1] = q
  10560. result = self._constructor_from_mgr(res, axes=res.axes)
  10561. return result.__finalize__(self, method="quantile")
  10562. def to_timestamp(
  10563. self,
  10564. freq: Frequency | None = None,
  10565. how: ToTimestampHow = "start",
  10566. axis: Axis = 0,
  10567. copy: bool | None = None,
  10568. ) -> DataFrame:
  10569. """
  10570. Cast to DatetimeIndex of timestamps, at *beginning* of period.
  10571. Parameters
  10572. ----------
  10573. freq : str, default frequency of PeriodIndex
  10574. Desired frequency.
  10575. how : {'s', 'e', 'start', 'end'}
  10576. Convention for converting period to timestamp; start of period
  10577. vs. end.
  10578. axis : {0 or 'index', 1 or 'columns'}, default 0
  10579. The axis to convert (the index by default).
  10580. copy : bool, default True
  10581. If False then underlying input data is not copied.
  10582. .. note::
  10583. The `copy` keyword will change behavior in pandas 3.0.
  10584. `Copy-on-Write
  10585. <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
  10586. will be enabled by default, which means that all methods with a
  10587. `copy` keyword will use a lazy copy mechanism to defer the copy and
  10588. ignore the `copy` keyword. The `copy` keyword will be removed in a
  10589. future version of pandas.
  10590. You can already get the future behavior and improvements through
  10591. enabling copy on write ``pd.options.mode.copy_on_write = True``
  10592. Returns
  10593. -------
  10594. DataFrame
  10595. The DataFrame has a DatetimeIndex.
  10596. Examples
  10597. --------
  10598. >>> idx = pd.PeriodIndex(['2023', '2024'], freq='Y')
  10599. >>> d = {'col1': [1, 2], 'col2': [3, 4]}
  10600. >>> df1 = pd.DataFrame(data=d, index=idx)
  10601. >>> df1
  10602. col1 col2
  10603. 2023 1 3
  10604. 2024 2 4
  10605. The resulting timestamps will be at the beginning of the year in this case
  10606. >>> df1 = df1.to_timestamp()
  10607. >>> df1
  10608. col1 col2
  10609. 2023-01-01 1 3
  10610. 2024-01-01 2 4
  10611. >>> df1.index
  10612. DatetimeIndex(['2023-01-01', '2024-01-01'], dtype='datetime64[ns]', freq=None)
  10613. Using `freq` which is the offset that the Timestamps will have
  10614. >>> df2 = pd.DataFrame(data=d, index=idx)
  10615. >>> df2 = df2.to_timestamp(freq='M')
  10616. >>> df2
  10617. col1 col2
  10618. 2023-01-31 1 3
  10619. 2024-01-31 2 4
  10620. >>> df2.index
  10621. DatetimeIndex(['2023-01-31', '2024-01-31'], dtype='datetime64[ns]', freq=None)
  10622. """
  10623. new_obj = self.copy(deep=copy and not using_copy_on_write())
  10624. axis_name = self._get_axis_name(axis)
  10625. old_ax = getattr(self, axis_name)
  10626. if not isinstance(old_ax, PeriodIndex):
  10627. raise TypeError(f"unsupported Type {type(old_ax).__name__}")
  10628. new_ax = old_ax.to_timestamp(freq=freq, how=how)
  10629. setattr(new_obj, axis_name, new_ax)
  10630. return new_obj
  10631. def to_period(
  10632. self, freq: Frequency | None = None, axis: Axis = 0, copy: bool | None = None
  10633. ) -> DataFrame:
  10634. """
  10635. Convert DataFrame from DatetimeIndex to PeriodIndex.
  10636. Convert DataFrame from DatetimeIndex to PeriodIndex with desired
  10637. frequency (inferred from index if not passed).
  10638. Parameters
  10639. ----------
  10640. freq : str, default
  10641. Frequency of the PeriodIndex.
  10642. axis : {0 or 'index', 1 or 'columns'}, default 0
  10643. The axis to convert (the index by default).
  10644. copy : bool, default True
  10645. If False then underlying input data is not copied.
  10646. .. note::
  10647. The `copy` keyword will change behavior in pandas 3.0.
  10648. `Copy-on-Write
  10649. <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
  10650. will be enabled by default, which means that all methods with a
  10651. `copy` keyword will use a lazy copy mechanism to defer the copy and
  10652. ignore the `copy` keyword. The `copy` keyword will be removed in a
  10653. future version of pandas.
  10654. You can already get the future behavior and improvements through
  10655. enabling copy on write ``pd.options.mode.copy_on_write = True``
  10656. Returns
  10657. -------
  10658. DataFrame
  10659. The DataFrame has a PeriodIndex.
  10660. Examples
  10661. --------
  10662. >>> idx = pd.to_datetime(
  10663. ... [
  10664. ... "2001-03-31 00:00:00",
  10665. ... "2002-05-31 00:00:00",
  10666. ... "2003-08-31 00:00:00",
  10667. ... ]
  10668. ... )
  10669. >>> idx
  10670. DatetimeIndex(['2001-03-31', '2002-05-31', '2003-08-31'],
  10671. dtype='datetime64[ns]', freq=None)
  10672. >>> idx.to_period("M")
  10673. PeriodIndex(['2001-03', '2002-05', '2003-08'], dtype='period[M]')
  10674. For the yearly frequency
  10675. >>> idx.to_period("Y")
  10676. PeriodIndex(['2001', '2002', '2003'], dtype='period[Y-DEC]')
  10677. """
  10678. new_obj = self.copy(deep=copy and not using_copy_on_write())
  10679. axis_name = self._get_axis_name(axis)
  10680. old_ax = getattr(self, axis_name)
  10681. if not isinstance(old_ax, DatetimeIndex):
  10682. raise TypeError(f"unsupported Type {type(old_ax).__name__}")
  10683. new_ax = old_ax.to_period(freq=freq)
  10684. setattr(new_obj, axis_name, new_ax)
  10685. return new_obj
  10686. def isin(self, values: Series | DataFrame | Sequence | Mapping) -> DataFrame:
  10687. """
  10688. Whether each element in the DataFrame is contained in values.
  10689. Parameters
  10690. ----------
  10691. values : iterable, Series, DataFrame or dict
  10692. The result will only be true at a location if all the
  10693. labels match. If `values` is a Series, that's the index. If
  10694. `values` is a dict, the keys must be the column names,
  10695. which must match. If `values` is a DataFrame,
  10696. then both the index and column labels must match.
  10697. Returns
  10698. -------
  10699. DataFrame
  10700. DataFrame of booleans showing whether each element in the DataFrame
  10701. is contained in values.
  10702. See Also
  10703. --------
  10704. DataFrame.eq: Equality test for DataFrame.
  10705. Series.isin: Equivalent method on Series.
  10706. Series.str.contains: Test if pattern or regex is contained within a
  10707. string of a Series or Index.
  10708. Examples
  10709. --------
  10710. >>> df = pd.DataFrame({'num_legs': [2, 4], 'num_wings': [2, 0]},
  10711. ... index=['falcon', 'dog'])
  10712. >>> df
  10713. num_legs num_wings
  10714. falcon 2 2
  10715. dog 4 0
  10716. When ``values`` is a list check whether every value in the DataFrame
  10717. is present in the list (which animals have 0 or 2 legs or wings)
  10718. >>> df.isin([0, 2])
  10719. num_legs num_wings
  10720. falcon True True
  10721. dog False True
  10722. To check if ``values`` is *not* in the DataFrame, use the ``~`` operator:
  10723. >>> ~df.isin([0, 2])
  10724. num_legs num_wings
  10725. falcon False False
  10726. dog True False
  10727. When ``values`` is a dict, we can pass values to check for each
  10728. column separately:
  10729. >>> df.isin({'num_wings': [0, 3]})
  10730. num_legs num_wings
  10731. falcon False False
  10732. dog False True
  10733. When ``values`` is a Series or DataFrame the index and column must
  10734. match. Note that 'falcon' does not match based on the number of legs
  10735. in other.
  10736. >>> other = pd.DataFrame({'num_legs': [8, 3], 'num_wings': [0, 2]},
  10737. ... index=['spider', 'falcon'])
  10738. >>> df.isin(other)
  10739. num_legs num_wings
  10740. falcon False True
  10741. dog False False
  10742. """
  10743. if isinstance(values, dict):
  10744. from pandas.core.reshape.concat import concat
  10745. values = collections.defaultdict(list, values)
  10746. result = concat(
  10747. (
  10748. self.iloc[:, [i]].isin(values[col])
  10749. for i, col in enumerate(self.columns)
  10750. ),
  10751. axis=1,
  10752. )
  10753. elif isinstance(values, Series):
  10754. if not values.index.is_unique:
  10755. raise ValueError("cannot compute isin with a duplicate axis.")
  10756. result = self.eq(values.reindex_like(self), axis="index")
  10757. elif isinstance(values, DataFrame):
  10758. if not (values.columns.is_unique and values.index.is_unique):
  10759. raise ValueError("cannot compute isin with a duplicate axis.")
  10760. result = self.eq(values.reindex_like(self))
  10761. else:
  10762. if not is_list_like(values):
  10763. raise TypeError(
  10764. "only list-like or dict-like objects are allowed "
  10765. "to be passed to DataFrame.isin(), "
  10766. f"you passed a '{type(values).__name__}'"
  10767. )
  10768. def isin_(x):
  10769. # error: Argument 2 to "isin" has incompatible type "Union[Series,
  10770. # DataFrame, Sequence[Any], Mapping[Any, Any]]"; expected
  10771. # "Union[Union[Union[ExtensionArray, ndarray[Any, Any]], Index,
  10772. # Series], List[Any], range]"
  10773. result = algorithms.isin(
  10774. x.ravel(),
  10775. values, # type: ignore[arg-type]
  10776. )
  10777. return result.reshape(x.shape)
  10778. res_mgr = self._mgr.apply(isin_)
  10779. result = self._constructor_from_mgr(
  10780. res_mgr,
  10781. axes=res_mgr.axes,
  10782. )
  10783. return result.__finalize__(self, method="isin")
# ----------------------------------------------------------------------
# Add index and columns

# Axis names in positional order: axis 0 is "index", axis 1 is "columns".
_AXIS_ORDERS: list[Literal["index", "columns"]] = ["index", "columns"]
# Extend the axis aliases inherited from NDFrame with the column axis:
# both the integer ``1`` and the name ``"columns"`` resolve to axis 1.
_AXIS_TO_AXIS_NUMBER: dict[Axis, int] = {
    **NDFrame._AXIS_TO_AXIS_NUMBER,
    1: 1,
    "columns": 1,
}
# Number of axes, derived from _AXIS_ORDERS rather than hard-coded.
_AXIS_LEN = len(_AXIS_ORDERS)
# The "info axis" of a DataFrame is the columns axis (axis 1).
_info_axis_number: Literal[1] = 1
_info_axis_name: Literal["columns"] = "columns"
# Public ``index`` attribute, exposed through an AxisProperty descriptor.
# NOTE(review): ``axis=1`` is used for the *index* here while ``columns``
# uses ``axis=0`` -- presumably AxisProperty addresses the internal
# manager's axes, which appear to be ordered opposite to the public axis
# numbers; confirm against ``properties.AxisProperty``.
index = properties.AxisProperty(
    axis=1,
    doc="""
    The index (row labels) of the DataFrame.

    The index of a DataFrame is a series of labels that identify each row.
    The labels can be integers, strings, or any other hashable type. The index
    is used for label-based access and alignment, and can be accessed or
    modified using this attribute.

    Returns
    -------
    pandas.Index
        The index labels of the DataFrame.

    See Also
    --------
    DataFrame.columns : The column labels of the DataFrame.
    DataFrame.to_numpy : Convert the DataFrame to a NumPy array.

    Examples
    --------
    >>> df = pd.DataFrame({'Name': ['Alice', 'Bob', 'Aritra'],
    ...                    'Age': [25, 30, 35],
    ...                    'Location': ['Seattle', 'New York', 'Kona']},
    ...                   index=([10, 20, 30]))
    >>> df.index
    Index([10, 20, 30], dtype='int64')

    In this example, we create a DataFrame with 3 rows and 3 columns,
    including Name, Age, and Location information. We set the index labels to
    be the integers 10, 20, and 30. We then access the `index` attribute of the
    DataFrame, which returns an `Index` object containing the index labels.

    >>> df.index = [100, 200, 300]
    >>> df
           Name  Age  Location
    100   Alice   25   Seattle
    200     Bob   30  New York
    300  Aritra   35      Kona

    In this example, we modify the index labels of the DataFrame by assigning
    a new list of labels to the `index` attribute. The DataFrame is then
    updated with the new labels, and the output shows the modified DataFrame.
    """,
)
# Public ``columns`` attribute, exposed through an AxisProperty descriptor.
# The docstring is passed through ``dedent`` so its source indentation is
# stripped before it is attached to the property.
# NOTE(review): ``axis=0`` is used for the *columns* here while ``index``
# uses ``axis=1`` -- presumably these are the internal manager's axis
# numbers rather than the public ones; confirm against
# ``properties.AxisProperty``.
columns = properties.AxisProperty(
    axis=0,
    doc=dedent(
        """
        The column labels of the DataFrame.

        Examples
        --------
        >>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
        >>> df
           A  B
        0  1  3
        1  2  4
        >>> df.columns
        Index(['A', 'B'], dtype='object')
        """
    ),
)
# ----------------------------------------------------------------------
# Add plotting methods to DataFrame

# ``plot`` is registered as a cached accessor backed by
# ``pandas.plotting.PlotAccessor``.
plot = CachedAccessor("plot", pandas.plotting.PlotAccessor)
# ``hist`` and ``boxplot`` are plain functions from ``pandas.plotting``
# bound as class attributes, so they act as methods taking the frame first.
hist = pandas.plotting.hist_frame
boxplot = pandas.plotting.boxplot_frame
# Not a plotting method: the ``sparse`` accessor is registered here as well.
sparse = CachedAccessor("sparse", SparseFrameAccessor)
  10857. # ----------------------------------------------------------------------
  10858. # Internal Interface Methods
  10859. def _to_dict_of_blocks(self):
  10860. """
  10861. Return a dict of dtype -> Constructor Types that
  10862. each is a homogeneous dtype.
  10863. Internal ONLY - only works for BlockManager
  10864. """
  10865. mgr = self._mgr
  10866. # convert to BlockManager if needed -> this way support ArrayManager as well
  10867. mgr = cast(BlockManager, mgr_to_mgr(mgr, "block"))
  10868. return {
  10869. k: self._constructor_from_mgr(v, axes=v.axes).__finalize__(self)
  10870. for k, v, in mgr.to_dict().items()
  10871. }
  10872. @property
  10873. def values(self) -> np.ndarray:
  10874. """
  10875. Return a Numpy representation of the DataFrame.
  10876. .. warning::
  10877. We recommend using :meth:`DataFrame.to_numpy` instead.
  10878. Only the values in the DataFrame will be returned, the axes labels
  10879. will be removed.
  10880. Returns
  10881. -------
  10882. numpy.ndarray
  10883. The values of the DataFrame.
  10884. See Also
  10885. --------
  10886. DataFrame.to_numpy : Recommended alternative to this method.
  10887. DataFrame.index : Retrieve the index labels.
  10888. DataFrame.columns : Retrieving the column names.
  10889. Notes
  10890. -----
  10891. The dtype will be a lower-common-denominator dtype (implicit
  10892. upcasting); that is to say if the dtypes (even of numeric types)
  10893. are mixed, the one that accommodates all will be chosen. Use this
  10894. with care if you are not dealing with the blocks.
  10895. e.g. If the dtypes are float16 and float32, dtype will be upcast to
  10896. float32. If dtypes are int32 and uint8, dtype will be upcast to
  10897. int32. By :func:`numpy.find_common_type` convention, mixing int64
  10898. and uint64 will result in a float64 dtype.
  10899. Examples
  10900. --------
  10901. A DataFrame where all columns are the same type (e.g., int64) results
  10902. in an array of the same type.
  10903. >>> df = pd.DataFrame({'age': [ 3, 29],
  10904. ... 'height': [94, 170],
  10905. ... 'weight': [31, 115]})
  10906. >>> df
  10907. age height weight
  10908. 0 3 94 31
  10909. 1 29 170 115
  10910. >>> df.dtypes
  10911. age int64
  10912. height int64
  10913. weight int64
  10914. dtype: object
  10915. >>> df.values
  10916. array([[ 3, 94, 31],
  10917. [ 29, 170, 115]])
  10918. A DataFrame with mixed type columns(e.g., str/object, int64, float32)
  10919. results in an ndarray of the broadest type that accommodates these
  10920. mixed types (e.g., object).
  10921. >>> df2 = pd.DataFrame([('parrot', 24.0, 'second'),
  10922. ... ('lion', 80.5, 1),
  10923. ... ('monkey', np.nan, None)],
  10924. ... columns=('name', 'max_speed', 'rank'))
  10925. >>> df2.dtypes
  10926. name object
  10927. max_speed float64
  10928. rank object
  10929. dtype: object
  10930. >>> df2.values
  10931. array([['parrot', 24.0, 'second'],
  10932. ['lion', 80.5, 1],
  10933. ['monkey', nan, None]], dtype=object)
  10934. """
  10935. return self._mgr.as_array()
  10936. def _from_nested_dict(data) -> collections.defaultdict:
  10937. new_data: collections.defaultdict = collections.defaultdict(dict)
  10938. for index, s in data.items():
  10939. for col, v in s.items():
  10940. new_data[col][index] = v
  10941. return new_data
  10942. def _reindex_for_setitem(
  10943. value: DataFrame | Series, index: Index
  10944. ) -> tuple[ArrayLike, BlockValuesRefs | None]:
  10945. # reindex if necessary
  10946. if value.index.equals(index) or not len(index):
  10947. if using_copy_on_write() and isinstance(value, Series):
  10948. return value._values, value._references
  10949. return value._values.copy(), None
  10950. # GH#4107
  10951. try:
  10952. reindexed_value = value.reindex(index)._values
  10953. except ValueError as err:
  10954. # raised in MultiIndex.from_tuples, see test_insert_error_msmgs
  10955. if not value.index.is_unique:
  10956. # duplicate axis
  10957. raise err
  10958. raise TypeError(
  10959. "incompatible index of inserted column with frame index"
  10960. ) from err
  10961. return reindexed_value, None