# pytables.py (177 KB)
"""
High level interface to PyTables for reading and writing pandas data structures
to disk
"""
  5. from __future__ import annotations
  6. from contextlib import suppress
  7. import copy
  8. from datetime import (
  9. date,
  10. tzinfo,
  11. )
  12. import itertools
  13. import os
  14. import re
  15. from textwrap import dedent
  16. from typing import (
  17. TYPE_CHECKING,
  18. Any,
  19. Callable,
  20. Final,
  21. Literal,
  22. cast,
  23. overload,
  24. )
  25. import warnings
  26. import numpy as np
  27. from pandas._config import (
  28. config,
  29. get_option,
  30. using_copy_on_write,
  31. using_string_dtype,
  32. )
  33. from pandas._libs import (
  34. lib,
  35. writers as libwriters,
  36. )
  37. from pandas._libs.lib import is_string_array
  38. from pandas._libs.tslibs import timezones
  39. from pandas.compat import HAS_PYARROW
  40. from pandas.compat._optional import import_optional_dependency
  41. from pandas.compat.pickle_compat import patch_pickle
  42. from pandas.errors import (
  43. AttributeConflictWarning,
  44. ClosedFileError,
  45. IncompatibilityWarning,
  46. PerformanceWarning,
  47. PossibleDataLossError,
  48. )
  49. from pandas.util._decorators import cache_readonly
  50. from pandas.util._exceptions import find_stack_level
  51. from pandas.core.dtypes.common import (
  52. ensure_object,
  53. is_bool_dtype,
  54. is_complex_dtype,
  55. is_list_like,
  56. is_string_dtype,
  57. needs_i8_conversion,
  58. )
  59. from pandas.core.dtypes.dtypes import (
  60. CategoricalDtype,
  61. DatetimeTZDtype,
  62. ExtensionDtype,
  63. PeriodDtype,
  64. )
  65. from pandas.core.dtypes.missing import array_equivalent
  66. from pandas import (
  67. DataFrame,
  68. DatetimeIndex,
  69. Index,
  70. MultiIndex,
  71. PeriodIndex,
  72. RangeIndex,
  73. Series,
  74. StringDtype,
  75. TimedeltaIndex,
  76. concat,
  77. isna,
  78. )
  79. from pandas.core.arrays import (
  80. Categorical,
  81. DatetimeArray,
  82. PeriodArray,
  83. )
  84. from pandas.core.arrays.string_ import BaseStringArray
  85. import pandas.core.common as com
  86. from pandas.core.computation.pytables import (
  87. PyTablesExpr,
  88. maybe_expression,
  89. )
  90. from pandas.core.construction import (
  91. array as pd_array,
  92. extract_array,
  93. )
  94. from pandas.core.indexes.api import ensure_index
  95. from pandas.core.internals import (
  96. ArrayManager,
  97. BlockManager,
  98. )
  99. from pandas.io.common import stringify_path
  100. from pandas.io.formats.printing import (
  101. adjoin,
  102. pprint_thing,
  103. )
  104. if TYPE_CHECKING:
  105. from collections.abc import (
  106. Hashable,
  107. Iterator,
  108. Sequence,
  109. )
  110. from types import TracebackType
  111. from tables import (
  112. Col,
  113. File,
  114. Node,
  115. )
  116. from pandas._typing import (
  117. AnyArrayLike,
  118. ArrayLike,
  119. AxisInt,
  120. DtypeArg,
  121. FilePath,
  122. Self,
  123. Shape,
  124. npt,
  125. )
  126. from pandas.core.internals import Block
  127. # versioning attribute
  128. _version = "0.15.2"
  129. # encoding
  130. _default_encoding = "UTF-8"
  131. def _ensure_decoded(s):
  132. """if we have bytes, decode them to unicode"""
  133. if isinstance(s, np.bytes_):
  134. s = s.decode("UTF-8")
  135. return s
  136. def _ensure_encoding(encoding: str | None) -> str:
  137. # set the encoding if we need
  138. if encoding is None:
  139. encoding = _default_encoding
  140. return encoding
  141. def _ensure_str(name):
  142. """
  143. Ensure that an index / column name is a str (python 3); otherwise they
  144. may be np.string dtype. Non-string dtypes are passed through unchanged.
  145. https://github.com/pandas-dev/pandas/issues/13492
  146. """
  147. if isinstance(name, str):
  148. name = str(name)
  149. return name
  150. Term = PyTablesExpr
  151. def _ensure_term(where, scope_level: int):
  152. """
  153. Ensure that the where is a Term or a list of Term.
  154. This makes sure that we are capturing the scope of variables that are
  155. passed create the terms here with a frame_level=2 (we are 2 levels down)
  156. """
  157. # only consider list/tuple here as an ndarray is automatically a coordinate
  158. # list
  159. level = scope_level + 1
  160. if isinstance(where, (list, tuple)):
  161. where = [
  162. Term(term, scope_level=level + 1) if maybe_expression(term) else term
  163. for term in where
  164. if term is not None
  165. ]
  166. elif maybe_expression(where):
  167. where = Term(where, scope_level=level)
  168. return where if where is None or len(where) else None
  169. incompatibility_doc: Final = """
  170. where criteria is being ignored as this version [%s] is too old (or
  171. not-defined), read the file in and write it out to a new file to upgrade (with
  172. the copy_to method)
  173. """
  174. attribute_conflict_doc: Final = """
  175. the [%s] attribute of the existing index is [%s] which conflicts with the new
  176. [%s], resetting the attribute to None
  177. """
  178. performance_doc: Final = """
  179. your performance may suffer as PyTables will pickle object types that it cannot
  180. map directly to c-types [inferred_type->%s,key->%s] [items->%s]
  181. """
  182. # formats
  183. _FORMAT_MAP = {"f": "fixed", "fixed": "fixed", "t": "table", "table": "table"}
  184. # axes map
  185. _AXES_MAP = {DataFrame: [0]}
  186. # register our configuration options
  187. dropna_doc: Final = """
  188. : boolean
  189. drop ALL nan rows when appending to a table
  190. """
  191. format_doc: Final = """
  192. : format
  193. default format writing format, if None, then
  194. put will default to 'fixed' and append will default to 'table'
  195. """
  196. with config.config_prefix("io.hdf"):
  197. config.register_option("dropna_table", False, dropna_doc, validator=config.is_bool)
  198. config.register_option(
  199. "default_format",
  200. None,
  201. format_doc,
  202. validator=config.is_one_of_factory(["fixed", "table", None]),
  203. )
  204. # oh the troubles to reduce import time
  205. _table_mod = None
  206. _table_file_open_policy_is_strict = False
  207. def _tables():
  208. global _table_mod
  209. global _table_file_open_policy_is_strict
  210. if _table_mod is None:
  211. import tables
  212. _table_mod = tables
  213. # set the file open policy
  214. # return the file open policy; this changes as of pytables 3.1
  215. # depending on the HDF5 version
  216. with suppress(AttributeError):
  217. _table_file_open_policy_is_strict = (
  218. tables.file._FILE_OPEN_POLICY == "strict"
  219. )
  220. return _table_mod
  221. # interface to/from ###
  222. def to_hdf(
  223. path_or_buf: FilePath | HDFStore,
  224. key: str,
  225. value: DataFrame | Series,
  226. mode: str = "a",
  227. complevel: int | None = None,
  228. complib: str | None = None,
  229. append: bool = False,
  230. format: str | None = None,
  231. index: bool = True,
  232. min_itemsize: int | dict[str, int] | None = None,
  233. nan_rep=None,
  234. dropna: bool | None = None,
  235. data_columns: Literal[True] | list[str] | None = None,
  236. errors: str = "strict",
  237. encoding: str = "UTF-8",
  238. ) -> None:
  239. """store this object, close it if we opened it"""
  240. if append:
  241. f = lambda store: store.append(
  242. key,
  243. value,
  244. format=format,
  245. index=index,
  246. min_itemsize=min_itemsize,
  247. nan_rep=nan_rep,
  248. dropna=dropna,
  249. data_columns=data_columns,
  250. errors=errors,
  251. encoding=encoding,
  252. )
  253. else:
  254. # NB: dropna is not passed to `put`
  255. f = lambda store: store.put(
  256. key,
  257. value,
  258. format=format,
  259. index=index,
  260. min_itemsize=min_itemsize,
  261. nan_rep=nan_rep,
  262. data_columns=data_columns,
  263. errors=errors,
  264. encoding=encoding,
  265. dropna=dropna,
  266. )
  267. path_or_buf = stringify_path(path_or_buf)
  268. if isinstance(path_or_buf, str):
  269. with HDFStore(
  270. path_or_buf, mode=mode, complevel=complevel, complib=complib
  271. ) as store:
  272. f(store)
  273. else:
  274. f(path_or_buf)
  275. def read_hdf(
  276. path_or_buf: FilePath | HDFStore,
  277. key=None,
  278. mode: str = "r",
  279. errors: str = "strict",
  280. where: str | list | None = None,
  281. start: int | None = None,
  282. stop: int | None = None,
  283. columns: list[str] | None = None,
  284. iterator: bool = False,
  285. chunksize: int | None = None,
  286. **kwargs,
  287. ):
  288. """
  289. Read from the store, close it if we opened it.
  290. Retrieve pandas object stored in file, optionally based on where
  291. criteria.
  292. .. warning::
  293. Pandas uses PyTables for reading and writing HDF5 files, which allows
  294. serializing object-dtype data with pickle when using the "fixed" format.
  295. Loading pickled data received from untrusted sources can be unsafe.
  296. See: https://docs.python.org/3/library/pickle.html for more.
  297. Parameters
  298. ----------
  299. path_or_buf : str, path object, pandas.HDFStore
  300. Any valid string path is acceptable. Only supports the local file system,
  301. remote URLs and file-like objects are not supported.
  302. If you want to pass in a path object, pandas accepts any
  303. ``os.PathLike``.
  304. Alternatively, pandas accepts an open :class:`pandas.HDFStore` object.
  305. key : object, optional
  306. The group identifier in the store. Can be omitted if the HDF file
  307. contains a single pandas object.
  308. mode : {'r', 'r+', 'a'}, default 'r'
  309. Mode to use when opening the file. Ignored if path_or_buf is a
  310. :class:`pandas.HDFStore`. Default is 'r'.
  311. errors : str, default 'strict'
  312. Specifies how encoding and decoding errors are to be handled.
  313. See the errors argument for :func:`open` for a full list
  314. of options.
  315. where : list, optional
  316. A list of Term (or convertible) objects.
  317. start : int, optional
  318. Row number to start selection.
  319. stop : int, optional
  320. Row number to stop selection.
  321. columns : list, optional
  322. A list of columns names to return.
  323. iterator : bool, optional
  324. Return an iterator object.
  325. chunksize : int, optional
  326. Number of rows to include in an iteration when using an iterator.
  327. **kwargs
  328. Additional keyword arguments passed to HDFStore.
  329. Returns
  330. -------
  331. object
  332. The selected object. Return type depends on the object stored.
  333. See Also
  334. --------
  335. DataFrame.to_hdf : Write a HDF file from a DataFrame.
  336. HDFStore : Low-level access to HDF files.
  337. Notes
  338. -----
  339. When ``errors="surrogatepass"``, ``pd.options.future.infer_string`` is true,
  340. and PyArrow is installed, if a UTF-16 surrogate is encountered when decoding
  341. to UTF-8, the resulting dtype will be
  342. ``pd.StringDtype(storage="python", na_value=np.nan)``.
  343. Examples
  344. --------
  345. >>> df = pd.DataFrame([[1, 1.0, 'a']], columns=['x', 'y', 'z']) # doctest: +SKIP
  346. >>> df.to_hdf('./store.h5', 'data') # doctest: +SKIP
  347. >>> reread = pd.read_hdf('./store.h5') # doctest: +SKIP
  348. """
  349. if mode not in ["r", "r+", "a"]:
  350. raise ValueError(
  351. f"mode {mode} is not allowed while performing a read. "
  352. f"Allowed modes are r, r+ and a."
  353. )
  354. # grab the scope
  355. if where is not None:
  356. where = _ensure_term(where, scope_level=1)
  357. if isinstance(path_or_buf, HDFStore):
  358. if not path_or_buf.is_open:
  359. raise OSError("The HDFStore must be open for reading.")
  360. store = path_or_buf
  361. auto_close = False
  362. else:
  363. path_or_buf = stringify_path(path_or_buf)
  364. if not isinstance(path_or_buf, str):
  365. raise NotImplementedError(
  366. "Support for generic buffers has not been implemented."
  367. )
  368. try:
  369. exists = os.path.exists(path_or_buf)
  370. # if filepath is too long
  371. except (TypeError, ValueError):
  372. exists = False
  373. if not exists:
  374. raise FileNotFoundError(f"File {path_or_buf} does not exist")
  375. store = HDFStore(path_or_buf, mode=mode, errors=errors, **kwargs)
  376. # can't auto open/close if we are using an iterator
  377. # so delegate to the iterator
  378. auto_close = True
  379. try:
  380. if key is None:
  381. groups = store.groups()
  382. if len(groups) == 0:
  383. raise ValueError(
  384. "Dataset(s) incompatible with Pandas data types, "
  385. "not table, or no datasets found in HDF5 file."
  386. )
  387. candidate_only_group = groups[0]
  388. # For the HDF file to have only one dataset, all other groups
  389. # should then be metadata groups for that candidate group. (This
  390. # assumes that the groups() method enumerates parent groups
  391. # before their children.)
  392. for group_to_check in groups[1:]:
  393. if not _is_metadata_of(group_to_check, candidate_only_group):
  394. raise ValueError(
  395. "key must be provided when HDF5 "
  396. "file contains multiple datasets."
  397. )
  398. key = candidate_only_group._v_pathname
  399. return store.select(
  400. key,
  401. where=where,
  402. start=start,
  403. stop=stop,
  404. columns=columns,
  405. iterator=iterator,
  406. chunksize=chunksize,
  407. auto_close=auto_close,
  408. )
  409. except (ValueError, TypeError, LookupError):
  410. if not isinstance(path_or_buf, HDFStore):
  411. # if there is an error, close the store if we opened it.
  412. with suppress(AttributeError):
  413. store.close()
  414. raise
  415. def _is_metadata_of(group: Node, parent_group: Node) -> bool:
  416. """Check if a given group is a metadata group for a given parent_group."""
  417. if group._v_depth <= parent_group._v_depth:
  418. return False
  419. current = group
  420. while current._v_depth > 1:
  421. parent = current._v_parent
  422. if parent == parent_group and current._v_name == "meta":
  423. return True
  424. current = current._v_parent
  425. return False
class HDFStore:
    """
    Dict-like IO interface for storing pandas objects in PyTables.

    Either Fixed or Table format.

    .. warning::

       Pandas uses PyTables for reading and writing HDF5 files, which allows
       serializing object-dtype data with pickle when using the "fixed" format.
       Loading pickled data received from untrusted sources can be unsafe.

       See: https://docs.python.org/3/library/pickle.html for more.

    Parameters
    ----------
    path : str
        File path to HDF5 file.
    mode : {'a', 'w', 'r', 'r+'}, default 'a'

        ``'r'``
            Read-only; no data can be modified.
        ``'w'``
            Write; a new file is created (an existing file with the same
            name would be deleted).
        ``'a'``
            Append; an existing file is opened for reading and writing,
            and if the file does not exist it is created.
        ``'r+'``
            It is similar to ``'a'``, but the file must already exist.
    complevel : int, 0-9, default None
        Specifies a compression level for data.
        A value of 0 or None disables compression.
    complib : {'zlib', 'lzo', 'bzip2', 'blosc'}, default 'zlib'
        Specifies the compression library to be used.
        These additional compressors for Blosc are supported
        (default if no compressor specified: 'blosc:blosclz'):
        {'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy',
        'blosc:zlib', 'blosc:zstd'}.
        Specifying a compression library which is not available issues
        a ValueError.
    fletcher32 : bool, default False
        If applying compression use the fletcher32 checksum.
    **kwargs
        These parameters will be passed to the PyTables open_file method.

    Examples
    --------
    >>> bar = pd.DataFrame(np.random.randn(10, 4))
    >>> store = pd.HDFStore('test.h5')
    >>> store['foo'] = bar   # write to HDF5
    >>> bar = store['foo']   # retrieve
    >>> store.close()

    **Create or load HDF5 file in-memory**

    When passing the `driver` option to the PyTables open_file method through
    **kwargs, the HDF5 file is loaded or created in-memory and will only be
    written when closed:

    >>> bar = pd.DataFrame(np.random.randn(10, 4))
    >>> store = pd.HDFStore('test.h5', driver='H5FD_CORE')
    >>> store['foo'] = bar
    >>> store.close()   # only now, data is written to disk
    """

    # Underlying PyTables file handle; reset to None by ``close``.
    _handle: File | None
    # Mode string handed to ``tables.open_file`` when the file is (re)opened.
    _mode: str
  483. def __init__(
  484. self,
  485. path,
  486. mode: str = "a",
  487. complevel: int | None = None,
  488. complib=None,
  489. fletcher32: bool = False,
  490. **kwargs,
  491. ) -> None:
  492. if "format" in kwargs:
  493. raise ValueError("format is not a defined argument for HDFStore")
  494. tables = import_optional_dependency("tables")
  495. if complib is not None and complib not in tables.filters.all_complibs:
  496. raise ValueError(
  497. f"complib only supports {tables.filters.all_complibs} compression."
  498. )
  499. if complib is None and complevel is not None:
  500. complib = tables.filters.default_complib
  501. self._path = stringify_path(path)
  502. if mode is None:
  503. mode = "a"
  504. self._mode = mode
  505. self._handle = None
  506. self._complevel = complevel if complevel else 0
  507. self._complib = complib
  508. self._fletcher32 = fletcher32
  509. self._filters = None
  510. self.open(mode=mode, **kwargs)
  511. def __fspath__(self) -> str:
  512. return self._path
  513. @property
  514. def root(self):
  515. """return the root node"""
  516. self._check_if_open()
  517. assert self._handle is not None # for mypy
  518. return self._handle.root
  519. @property
  520. def filename(self) -> str:
  521. return self._path
  522. def __getitem__(self, key: str):
  523. return self.get(key)
  524. def __setitem__(self, key: str, value) -> None:
  525. self.put(key, value)
  526. def __delitem__(self, key: str) -> None:
  527. return self.remove(key)
  528. def __getattr__(self, name: str):
  529. """allow attribute access to get stores"""
  530. try:
  531. return self.get(name)
  532. except (KeyError, ClosedFileError):
  533. pass
  534. raise AttributeError(
  535. f"'{type(self).__name__}' object has no attribute '{name}'"
  536. )
  537. def __contains__(self, key: str) -> bool:
  538. """
  539. check for existence of this key
  540. can match the exact pathname or the pathnm w/o the leading '/'
  541. """
  542. node = self.get_node(key)
  543. if node is not None:
  544. name = node._v_pathname
  545. if key in (name, name[1:]):
  546. return True
  547. return False
  548. def __len__(self) -> int:
  549. return len(self.groups())
  550. def __repr__(self) -> str:
  551. pstr = pprint_thing(self._path)
  552. return f"{type(self)}\nFile path: {pstr}\n"
  553. def __enter__(self) -> Self:
  554. return self
  555. def __exit__(
  556. self,
  557. exc_type: type[BaseException] | None,
  558. exc_value: BaseException | None,
  559. traceback: TracebackType | None,
  560. ) -> None:
  561. self.close()
  562. def keys(self, include: str = "pandas") -> list[str]:
  563. """
  564. Return a list of keys corresponding to objects stored in HDFStore.
  565. Parameters
  566. ----------
  567. include : str, default 'pandas'
  568. When kind equals 'pandas' return pandas objects.
  569. When kind equals 'native' return native HDF5 Table objects.
  570. Returns
  571. -------
  572. list
  573. List of ABSOLUTE path-names (e.g. have the leading '/').
  574. Raises
  575. ------
  576. raises ValueError if kind has an illegal value
  577. Examples
  578. --------
  579. >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])
  580. >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP
  581. >>> store.put('data', df) # doctest: +SKIP
  582. >>> store.get('data') # doctest: +SKIP
  583. >>> print(store.keys()) # doctest: +SKIP
  584. ['/data1', '/data2']
  585. >>> store.close() # doctest: +SKIP
  586. """
  587. if include == "pandas":
  588. return [n._v_pathname for n in self.groups()]
  589. elif include == "native":
  590. assert self._handle is not None # mypy
  591. return [
  592. n._v_pathname for n in self._handle.walk_nodes("/", classname="Table")
  593. ]
  594. raise ValueError(
  595. f"`include` should be either 'pandas' or 'native' but is '{include}'"
  596. )
  597. def __iter__(self) -> Iterator[str]:
  598. return iter(self.keys())
  599. def items(self) -> Iterator[tuple[str, list]]:
  600. """
  601. iterate on key->group
  602. """
  603. for g in self.groups():
  604. yield g._v_pathname, g
  605. def open(self, mode: str = "a", **kwargs) -> None:
  606. """
  607. Open the file in the specified mode
  608. Parameters
  609. ----------
  610. mode : {'a', 'w', 'r', 'r+'}, default 'a'
  611. See HDFStore docstring or tables.open_file for info about modes
  612. **kwargs
  613. These parameters will be passed to the PyTables open_file method.
  614. """
  615. tables = _tables()
  616. if self._mode != mode:
  617. # if we are changing a write mode to read, ok
  618. if self._mode in ["a", "w"] and mode in ["r", "r+"]:
  619. pass
  620. elif mode in ["w"]:
  621. # this would truncate, raise here
  622. if self.is_open:
  623. raise PossibleDataLossError(
  624. f"Re-opening the file [{self._path}] with mode [{self._mode}] "
  625. "will delete the current file!"
  626. )
  627. self._mode = mode
  628. # close and reopen the handle
  629. if self.is_open:
  630. self.close()
  631. if self._complevel and self._complevel > 0:
  632. self._filters = _tables().Filters(
  633. self._complevel, self._complib, fletcher32=self._fletcher32
  634. )
  635. if _table_file_open_policy_is_strict and self.is_open:
  636. msg = (
  637. "Cannot open HDF5 file, which is already opened, "
  638. "even in read-only mode."
  639. )
  640. raise ValueError(msg)
  641. self._handle = tables.open_file(self._path, self._mode, **kwargs)
  642. def close(self) -> None:
  643. """
  644. Close the PyTables file handle
  645. """
  646. if self._handle is not None:
  647. self._handle.close()
  648. self._handle = None
  649. @property
  650. def is_open(self) -> bool:
  651. """
  652. return a boolean indicating whether the file is open
  653. """
  654. if self._handle is None:
  655. return False
  656. return bool(self._handle.isopen)
  657. def flush(self, fsync: bool = False) -> None:
  658. """
  659. Force all buffered modifications to be written to disk.
  660. Parameters
  661. ----------
  662. fsync : bool (default False)
  663. call ``os.fsync()`` on the file handle to force writing to disk.
  664. Notes
  665. -----
  666. Without ``fsync=True``, flushing may not guarantee that the OS writes
  667. to disk. With fsync, the operation will block until the OS claims the
  668. file has been written; however, other caching layers may still
  669. interfere.
  670. """
  671. if self._handle is not None:
  672. self._handle.flush()
  673. if fsync:
  674. with suppress(OSError):
  675. os.fsync(self._handle.fileno())
  676. def get(self, key: str):
  677. """
  678. Retrieve pandas object stored in file.
  679. Parameters
  680. ----------
  681. key : str
  682. Returns
  683. -------
  684. object
  685. Same type as object stored in file.
  686. Examples
  687. --------
  688. >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])
  689. >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP
  690. >>> store.put('data', df) # doctest: +SKIP
  691. >>> store.get('data') # doctest: +SKIP
  692. >>> store.close() # doctest: +SKIP
  693. """
  694. with patch_pickle():
  695. # GH#31167 Without this patch, pickle doesn't know how to unpickle
  696. # old DateOffset objects now that they are cdef classes.
  697. group = self.get_node(key)
  698. if group is None:
  699. raise KeyError(f"No object named {key} in the file")
  700. return self._read_group(group)
  701. def select(
  702. self,
  703. key: str,
  704. where=None,
  705. start=None,
  706. stop=None,
  707. columns=None,
  708. iterator: bool = False,
  709. chunksize: int | None = None,
  710. auto_close: bool = False,
  711. ):
  712. """
  713. Retrieve pandas object stored in file, optionally based on where criteria.
  714. .. warning::
  715. Pandas uses PyTables for reading and writing HDF5 files, which allows
  716. serializing object-dtype data with pickle when using the "fixed" format.
  717. Loading pickled data received from untrusted sources can be unsafe.
  718. See: https://docs.python.org/3/library/pickle.html for more.
  719. Parameters
  720. ----------
  721. key : str
  722. Object being retrieved from file.
  723. where : list or None
  724. List of Term (or convertible) objects, optional.
  725. start : int or None
  726. Row number to start selection.
  727. stop : int, default None
  728. Row number to stop selection.
  729. columns : list or None
  730. A list of columns that if not None, will limit the return columns.
  731. iterator : bool or False
  732. Returns an iterator.
  733. chunksize : int or None
  734. Number or rows to include in iteration, return an iterator.
  735. auto_close : bool or False
  736. Should automatically close the store when finished.
  737. Returns
  738. -------
  739. object
  740. Retrieved object from file.
  741. Examples
  742. --------
  743. >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])
  744. >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP
  745. >>> store.put('data', df) # doctest: +SKIP
  746. >>> store.get('data') # doctest: +SKIP
  747. >>> print(store.keys()) # doctest: +SKIP
  748. ['/data1', '/data2']
  749. >>> store.select('/data1') # doctest: +SKIP
  750. A B
  751. 0 1 2
  752. 1 3 4
  753. >>> store.select('/data1', where='columns == A') # doctest: +SKIP
  754. A
  755. 0 1
  756. 1 3
  757. >>> store.close() # doctest: +SKIP
  758. """
  759. group = self.get_node(key)
  760. if group is None:
  761. raise KeyError(f"No object named {key} in the file")
  762. # create the storer and axes
  763. where = _ensure_term(where, scope_level=1)
  764. s = self._create_storer(group)
  765. s.infer_axes()
  766. # function to call on iteration
  767. def func(_start, _stop, _where):
  768. return s.read(start=_start, stop=_stop, where=_where, columns=columns)
  769. # create the iterator
  770. it = TableIterator(
  771. self,
  772. s,
  773. func,
  774. where=where,
  775. nrows=s.nrows,
  776. start=start,
  777. stop=stop,
  778. iterator=iterator,
  779. chunksize=chunksize,
  780. auto_close=auto_close,
  781. )
  782. return it.get_result()
  783. def select_as_coordinates(
  784. self,
  785. key: str,
  786. where=None,
  787. start: int | None = None,
  788. stop: int | None = None,
  789. ):
  790. """
  791. return the selection as an Index
  792. .. warning::
  793. Pandas uses PyTables for reading and writing HDF5 files, which allows
  794. serializing object-dtype data with pickle when using the "fixed" format.
  795. Loading pickled data received from untrusted sources can be unsafe.
  796. See: https://docs.python.org/3/library/pickle.html for more.
  797. Parameters
  798. ----------
  799. key : str
  800. where : list of Term (or convertible) objects, optional
  801. start : integer (defaults to None), row number to start selection
  802. stop : integer (defaults to None), row number to stop selection
  803. """
  804. where = _ensure_term(where, scope_level=1)
  805. tbl = self.get_storer(key)
  806. if not isinstance(tbl, Table):
  807. raise TypeError("can only read_coordinates with a table")
  808. return tbl.read_coordinates(where=where, start=start, stop=stop)
  809. def select_column(
  810. self,
  811. key: str,
  812. column: str,
  813. start: int | None = None,
  814. stop: int | None = None,
  815. ):
  816. """
  817. return a single column from the table. This is generally only useful to
  818. select an indexable
  819. .. warning::
  820. Pandas uses PyTables for reading and writing HDF5 files, which allows
  821. serializing object-dtype data with pickle when using the "fixed" format.
  822. Loading pickled data received from untrusted sources can be unsafe.
  823. See: https://docs.python.org/3/library/pickle.html for more.
  824. Parameters
  825. ----------
  826. key : str
  827. column : str
  828. The column of interest.
  829. start : int or None, default None
  830. stop : int or None, default None
  831. Raises
  832. ------
  833. raises KeyError if the column is not found (or key is not a valid
  834. store)
  835. raises ValueError if the column can not be extracted individually (it
  836. is part of a data block)
  837. """
  838. tbl = self.get_storer(key)
  839. if not isinstance(tbl, Table):
  840. raise TypeError("can only read_column with a table")
  841. return tbl.read_column(column=column, start=start, stop=stop)
  842. def select_as_multiple(
  843. self,
  844. keys,
  845. where=None,
  846. selector=None,
  847. columns=None,
  848. start=None,
  849. stop=None,
  850. iterator: bool = False,
  851. chunksize: int | None = None,
  852. auto_close: bool = False,
  853. ):
  854. """
  855. Retrieve pandas objects from multiple tables.
  856. .. warning::
  857. Pandas uses PyTables for reading and writing HDF5 files, which allows
  858. serializing object-dtype data with pickle when using the "fixed" format.
  859. Loading pickled data received from untrusted sources can be unsafe.
  860. See: https://docs.python.org/3/library/pickle.html for more.
  861. Parameters
  862. ----------
  863. keys : a list of the tables
  864. selector : the table to apply the where criteria (defaults to keys[0]
  865. if not supplied)
  866. columns : the columns I want back
  867. start : integer (defaults to None), row number to start selection
  868. stop : integer (defaults to None), row number to stop selection
  869. iterator : bool, return an iterator, default False
  870. chunksize : nrows to include in iteration, return an iterator
  871. auto_close : bool, default False
  872. Should automatically close the store when finished.
  873. Raises
  874. ------
  875. raises KeyError if keys or selector is not found or keys is empty
  876. raises TypeError if keys is not a list or tuple
  877. raises ValueError if the tables are not ALL THE SAME DIMENSIONS
  878. """
  879. # default to single select
  880. where = _ensure_term(where, scope_level=1)
  881. if isinstance(keys, (list, tuple)) and len(keys) == 1:
  882. keys = keys[0]
  883. if isinstance(keys, str):
  884. return self.select(
  885. key=keys,
  886. where=where,
  887. columns=columns,
  888. start=start,
  889. stop=stop,
  890. iterator=iterator,
  891. chunksize=chunksize,
  892. auto_close=auto_close,
  893. )
  894. if not isinstance(keys, (list, tuple)):
  895. raise TypeError("keys must be a list/tuple")
  896. if not len(keys):
  897. raise ValueError("keys must have a non-zero length")
  898. if selector is None:
  899. selector = keys[0]
  900. # collect the tables
  901. tbls = [self.get_storer(k) for k in keys]
  902. s = self.get_storer(selector)
  903. # validate rows
  904. nrows = None
  905. for t, k in itertools.chain([(s, selector)], zip(tbls, keys)):
  906. if t is None:
  907. raise KeyError(f"Invalid table [{k}]")
  908. if not t.is_table:
  909. raise TypeError(
  910. f"object [{t.pathname}] is not a table, and cannot be used in all "
  911. "select as multiple"
  912. )
  913. if nrows is None:
  914. nrows = t.nrows
  915. elif t.nrows != nrows:
  916. raise ValueError("all tables must have exactly the same nrows!")
  917. # The isinstance checks here are redundant with the check above,
  918. # but necessary for mypy; see GH#29757
  919. _tbls = [x for x in tbls if isinstance(x, Table)]
  920. # axis is the concentration axes
  921. axis = {t.non_index_axes[0][0] for t in _tbls}.pop()
  922. def func(_start, _stop, _where):
  923. # retrieve the objs, _where is always passed as a set of
  924. # coordinates here
  925. objs = [
  926. t.read(where=_where, columns=columns, start=_start, stop=_stop)
  927. for t in tbls
  928. ]
  929. # concat and return
  930. return concat(objs, axis=axis, verify_integrity=False)._consolidate()
  931. # create the iterator
  932. it = TableIterator(
  933. self,
  934. s,
  935. func,
  936. where=where,
  937. nrows=nrows,
  938. start=start,
  939. stop=stop,
  940. iterator=iterator,
  941. chunksize=chunksize,
  942. auto_close=auto_close,
  943. )
  944. return it.get_result(coordinates=True)
  945. def put(
  946. self,
  947. key: str,
  948. value: DataFrame | Series,
  949. format=None,
  950. index: bool = True,
  951. append: bool = False,
  952. complib=None,
  953. complevel: int | None = None,
  954. min_itemsize: int | dict[str, int] | None = None,
  955. nan_rep=None,
  956. data_columns: Literal[True] | list[str] | None = None,
  957. encoding=None,
  958. errors: str = "strict",
  959. track_times: bool = True,
  960. dropna: bool = False,
  961. ) -> None:
  962. """
  963. Store object in HDFStore.
  964. Parameters
  965. ----------
  966. key : str
  967. value : {Series, DataFrame}
  968. format : 'fixed(f)|table(t)', default is 'fixed'
  969. Format to use when storing object in HDFStore. Value can be one of:
  970. ``'fixed'``
  971. Fixed format. Fast writing/reading. Not-appendable, nor searchable.
  972. ``'table'``
  973. Table format. Write as a PyTables Table structure which may perform
  974. worse but allow more flexible operations like searching / selecting
  975. subsets of the data.
  976. index : bool, default True
  977. Write DataFrame index as a column.
  978. append : bool, default False
  979. This will force Table format, append the input data to the existing.
  980. data_columns : list of columns or True, default None
  981. List of columns to create as data columns, or True to use all columns.
  982. See `here
  983. <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#query-via-data-columns>`__.
  984. encoding : str, default None
  985. Provide an encoding for strings.
  986. track_times : bool, default True
  987. Parameter is propagated to 'create_table' method of 'PyTables'.
  988. If set to False it enables to have the same h5 files (same hashes)
  989. independent on creation time.
  990. dropna : bool, default False, optional
  991. Remove missing values.
  992. Examples
  993. --------
  994. >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])
  995. >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP
  996. >>> store.put('data', df) # doctest: +SKIP
  997. """
  998. if format is None:
  999. format = get_option("io.hdf.default_format") or "fixed"
  1000. format = self._validate_format(format)
  1001. self._write_to_group(
  1002. key,
  1003. value,
  1004. format=format,
  1005. index=index,
  1006. append=append,
  1007. complib=complib,
  1008. complevel=complevel,
  1009. min_itemsize=min_itemsize,
  1010. nan_rep=nan_rep,
  1011. data_columns=data_columns,
  1012. encoding=encoding,
  1013. errors=errors,
  1014. track_times=track_times,
  1015. dropna=dropna,
  1016. )
  1017. def remove(self, key: str, where=None, start=None, stop=None) -> None:
  1018. """
  1019. Remove pandas object partially by specifying the where condition
  1020. Parameters
  1021. ----------
  1022. key : str
  1023. Node to remove or delete rows from
  1024. where : list of Term (or convertible) objects, optional
  1025. start : integer (defaults to None), row number to start selection
  1026. stop : integer (defaults to None), row number to stop selection
  1027. Returns
  1028. -------
  1029. number of rows removed (or None if not a Table)
  1030. Raises
  1031. ------
  1032. raises KeyError if key is not a valid store
  1033. """
  1034. where = _ensure_term(where, scope_level=1)
  1035. try:
  1036. s = self.get_storer(key)
  1037. except KeyError:
  1038. # the key is not a valid store, re-raising KeyError
  1039. raise
  1040. except AssertionError:
  1041. # surface any assertion errors for e.g. debugging
  1042. raise
  1043. except Exception as err:
  1044. # In tests we get here with ClosedFileError, TypeError, and
  1045. # _table_mod.NoSuchNodeError. TODO: Catch only these?
  1046. if where is not None:
  1047. raise ValueError(
  1048. "trying to remove a node with a non-None where clause!"
  1049. ) from err
  1050. # we are actually trying to remove a node (with children)
  1051. node = self.get_node(key)
  1052. if node is not None:
  1053. node._f_remove(recursive=True)
  1054. return None
  1055. # remove the node
  1056. if com.all_none(where, start, stop):
  1057. s.group._f_remove(recursive=True)
  1058. # delete from the table
  1059. else:
  1060. if not s.is_table:
  1061. raise ValueError(
  1062. "can only remove with where on objects written as tables"
  1063. )
  1064. return s.delete(where=where, start=start, stop=stop)
  1065. def append(
  1066. self,
  1067. key: str,
  1068. value: DataFrame | Series,
  1069. format=None,
  1070. axes=None,
  1071. index: bool | list[str] = True,
  1072. append: bool = True,
  1073. complib=None,
  1074. complevel: int | None = None,
  1075. columns=None,
  1076. min_itemsize: int | dict[str, int] | None = None,
  1077. nan_rep=None,
  1078. chunksize: int | None = None,
  1079. expectedrows=None,
  1080. dropna: bool | None = None,
  1081. data_columns: Literal[True] | list[str] | None = None,
  1082. encoding=None,
  1083. errors: str = "strict",
  1084. ) -> None:
  1085. """
  1086. Append to Table in file.
  1087. Node must already exist and be Table format.
  1088. Parameters
  1089. ----------
  1090. key : str
  1091. value : {Series, DataFrame}
  1092. format : 'table' is the default
  1093. Format to use when storing object in HDFStore. Value can be one of:
  1094. ``'table'``
  1095. Table format. Write as a PyTables Table structure which may perform
  1096. worse but allow more flexible operations like searching / selecting
  1097. subsets of the data.
  1098. index : bool, default True
  1099. Write DataFrame index as a column.
  1100. append : bool, default True
  1101. Append the input data to the existing.
  1102. data_columns : list of columns, or True, default None
  1103. List of columns to create as indexed data columns for on-disk
  1104. queries, or True to use all columns. By default only the axes
  1105. of the object are indexed. See `here
  1106. <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#query-via-data-columns>`__.
  1107. min_itemsize : dict of columns that specify minimum str sizes
  1108. nan_rep : str to use as str nan representation
  1109. chunksize : size to chunk the writing
  1110. expectedrows : expected TOTAL row size of this table
  1111. encoding : default None, provide an encoding for str
  1112. dropna : bool, default False, optional
  1113. Do not write an ALL nan row to the store settable
  1114. by the option 'io.hdf.dropna_table'.
  1115. Notes
  1116. -----
  1117. Does *not* check if data being appended overlaps with existing
  1118. data in the table, so be careful
  1119. Examples
  1120. --------
  1121. >>> df1 = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])
  1122. >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP
  1123. >>> store.put('data', df1, format='table') # doctest: +SKIP
  1124. >>> df2 = pd.DataFrame([[5, 6], [7, 8]], columns=['A', 'B'])
  1125. >>> store.append('data', df2) # doctest: +SKIP
  1126. >>> store.close() # doctest: +SKIP
  1127. A B
  1128. 0 1 2
  1129. 1 3 4
  1130. 0 5 6
  1131. 1 7 8
  1132. """
  1133. if columns is not None:
  1134. raise TypeError(
  1135. "columns is not a supported keyword in append, try data_columns"
  1136. )
  1137. if dropna is None:
  1138. dropna = get_option("io.hdf.dropna_table")
  1139. if format is None:
  1140. format = get_option("io.hdf.default_format") or "table"
  1141. format = self._validate_format(format)
  1142. self._write_to_group(
  1143. key,
  1144. value,
  1145. format=format,
  1146. axes=axes,
  1147. index=index,
  1148. append=append,
  1149. complib=complib,
  1150. complevel=complevel,
  1151. min_itemsize=min_itemsize,
  1152. nan_rep=nan_rep,
  1153. chunksize=chunksize,
  1154. expectedrows=expectedrows,
  1155. dropna=dropna,
  1156. data_columns=data_columns,
  1157. encoding=encoding,
  1158. errors=errors,
  1159. )
  1160. def append_to_multiple(
  1161. self,
  1162. d: dict,
  1163. value,
  1164. selector,
  1165. data_columns=None,
  1166. axes=None,
  1167. dropna: bool = False,
  1168. **kwargs,
  1169. ) -> None:
  1170. """
  1171. Append to multiple tables
  1172. Parameters
  1173. ----------
  1174. d : a dict of table_name to table_columns, None is acceptable as the
  1175. values of one node (this will get all the remaining columns)
  1176. value : a pandas object
  1177. selector : a string that designates the indexable table; all of its
  1178. columns will be designed as data_columns, unless data_columns is
  1179. passed, in which case these are used
  1180. data_columns : list of columns to create as data columns, or True to
  1181. use all columns
  1182. dropna : if evaluates to True, drop rows from all tables if any single
  1183. row in each table has all NaN. Default False.
  1184. Notes
  1185. -----
  1186. axes parameter is currently not accepted
  1187. """
  1188. if axes is not None:
  1189. raise TypeError(
  1190. "axes is currently not accepted as a parameter to append_to_multiple; "
  1191. "you can create the tables independently instead"
  1192. )
  1193. if not isinstance(d, dict):
  1194. raise ValueError(
  1195. "append_to_multiple must have a dictionary specified as the "
  1196. "way to split the value"
  1197. )
  1198. if selector not in d:
  1199. raise ValueError(
  1200. "append_to_multiple requires a selector that is in passed dict"
  1201. )
  1202. # figure out the splitting axis (the non_index_axis)
  1203. axis = next(iter(set(range(value.ndim)) - set(_AXES_MAP[type(value)])))
  1204. # figure out how to split the value
  1205. remain_key = None
  1206. remain_values: list = []
  1207. for k, v in d.items():
  1208. if v is None:
  1209. if remain_key is not None:
  1210. raise ValueError(
  1211. "append_to_multiple can only have one value in d that is None"
  1212. )
  1213. remain_key = k
  1214. else:
  1215. remain_values.extend(v)
  1216. if remain_key is not None:
  1217. ordered = value.axes[axis]
  1218. ordd = ordered.difference(Index(remain_values))
  1219. ordd = sorted(ordered.get_indexer(ordd))
  1220. d[remain_key] = ordered.take(ordd)
  1221. # data_columns
  1222. if data_columns is None:
  1223. data_columns = d[selector]
  1224. # ensure rows are synchronized across the tables
  1225. if dropna:
  1226. idxs = (value[cols].dropna(how="all").index for cols in d.values())
  1227. valid_index = next(idxs)
  1228. for index in idxs:
  1229. valid_index = valid_index.intersection(index)
  1230. value = value.loc[valid_index]
  1231. min_itemsize = kwargs.pop("min_itemsize", None)
  1232. # append
  1233. for k, v in d.items():
  1234. dc = data_columns if k == selector else None
  1235. # compute the val
  1236. val = value.reindex(v, axis=axis)
  1237. filtered = (
  1238. {key: value for (key, value) in min_itemsize.items() if key in v}
  1239. if min_itemsize is not None
  1240. else None
  1241. )
  1242. self.append(k, val, data_columns=dc, min_itemsize=filtered, **kwargs)
  1243. def create_table_index(
  1244. self,
  1245. key: str,
  1246. columns=None,
  1247. optlevel: int | None = None,
  1248. kind: str | None = None,
  1249. ) -> None:
  1250. """
  1251. Create a pytables index on the table.
  1252. Parameters
  1253. ----------
  1254. key : str
  1255. columns : None, bool, or listlike[str]
  1256. Indicate which columns to create an index on.
  1257. * False : Do not create any indexes.
  1258. * True : Create indexes on all columns.
  1259. * None : Create indexes on all columns.
  1260. * listlike : Create indexes on the given columns.
  1261. optlevel : int or None, default None
  1262. Optimization level, if None, pytables defaults to 6.
  1263. kind : str or None, default None
  1264. Kind of index, if None, pytables defaults to "medium".
  1265. Raises
  1266. ------
  1267. TypeError: raises if the node is not a table
  1268. """
  1269. # version requirements
  1270. _tables()
  1271. s = self.get_storer(key)
  1272. if s is None:
  1273. return
  1274. if not isinstance(s, Table):
  1275. raise TypeError("cannot create table index on a Fixed format store")
  1276. s.create_index(columns=columns, optlevel=optlevel, kind=kind)
  1277. def groups(self) -> list:
  1278. """
  1279. Return a list of all the top-level nodes.
  1280. Each node returned is not a pandas storage object.
  1281. Returns
  1282. -------
  1283. list
  1284. List of objects.
  1285. Examples
  1286. --------
  1287. >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])
  1288. >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP
  1289. >>> store.put('data', df) # doctest: +SKIP
  1290. >>> print(store.groups()) # doctest: +SKIP
  1291. >>> store.close() # doctest: +SKIP
  1292. [/data (Group) ''
  1293. children := ['axis0' (Array), 'axis1' (Array), 'block0_values' (Array),
  1294. 'block0_items' (Array)]]
  1295. """
  1296. _tables()
  1297. self._check_if_open()
  1298. assert self._handle is not None # for mypy
  1299. assert _table_mod is not None # for mypy
  1300. return [
  1301. g
  1302. for g in self._handle.walk_groups()
  1303. if (
  1304. not isinstance(g, _table_mod.link.Link)
  1305. and (
  1306. getattr(g._v_attrs, "pandas_type", None)
  1307. or getattr(g, "table", None)
  1308. or (isinstance(g, _table_mod.table.Table) and g._v_name != "table")
  1309. )
  1310. )
  1311. ]
  1312. def walk(self, where: str = "/") -> Iterator[tuple[str, list[str], list[str]]]:
  1313. """
  1314. Walk the pytables group hierarchy for pandas objects.
  1315. This generator will yield the group path, subgroups and pandas object
  1316. names for each group.
  1317. Any non-pandas PyTables objects that are not a group will be ignored.
  1318. The `where` group itself is listed first (preorder), then each of its
  1319. child groups (following an alphanumerical order) is also traversed,
  1320. following the same procedure.
  1321. Parameters
  1322. ----------
  1323. where : str, default "/"
  1324. Group where to start walking.
  1325. Yields
  1326. ------
  1327. path : str
  1328. Full path to a group (without trailing '/').
  1329. groups : list
  1330. Names (strings) of the groups contained in `path`.
  1331. leaves : list
  1332. Names (strings) of the pandas objects contained in `path`.
  1333. Examples
  1334. --------
  1335. >>> df1 = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])
  1336. >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP
  1337. >>> store.put('data', df1, format='table') # doctest: +SKIP
  1338. >>> df2 = pd.DataFrame([[5, 6], [7, 8]], columns=['A', 'B'])
  1339. >>> store.append('data', df2) # doctest: +SKIP
  1340. >>> store.close() # doctest: +SKIP
  1341. >>> for group in store.walk(): # doctest: +SKIP
  1342. ... print(group) # doctest: +SKIP
  1343. >>> store.close() # doctest: +SKIP
  1344. """
  1345. _tables()
  1346. self._check_if_open()
  1347. assert self._handle is not None # for mypy
  1348. assert _table_mod is not None # for mypy
  1349. for g in self._handle.walk_groups(where):
  1350. if getattr(g._v_attrs, "pandas_type", None) is not None:
  1351. continue
  1352. groups = []
  1353. leaves = []
  1354. for child in g._v_children.values():
  1355. pandas_type = getattr(child._v_attrs, "pandas_type", None)
  1356. if pandas_type is None:
  1357. if isinstance(child, _table_mod.group.Group):
  1358. groups.append(child._v_name)
  1359. else:
  1360. leaves.append(child._v_name)
  1361. yield (g._v_pathname.rstrip("/"), groups, leaves)
  1362. def get_node(self, key: str) -> Node | None:
  1363. """return the node with the key or None if it does not exist"""
  1364. self._check_if_open()
  1365. if not key.startswith("/"):
  1366. key = "/" + key
  1367. assert self._handle is not None
  1368. assert _table_mod is not None # for mypy
  1369. try:
  1370. node = self._handle.get_node(self.root, key)
  1371. except _table_mod.exceptions.NoSuchNodeError:
  1372. return None
  1373. assert isinstance(node, _table_mod.Node), type(node)
  1374. return node
  1375. def get_storer(self, key: str) -> GenericFixed | Table:
  1376. """return the storer object for a key, raise if not in the file"""
  1377. group = self.get_node(key)
  1378. if group is None:
  1379. raise KeyError(f"No object named {key} in the file")
  1380. s = self._create_storer(group)
  1381. s.infer_axes()
  1382. return s
  1383. def copy(
  1384. self,
  1385. file,
  1386. mode: str = "w",
  1387. propindexes: bool = True,
  1388. keys=None,
  1389. complib=None,
  1390. complevel: int | None = None,
  1391. fletcher32: bool = False,
  1392. overwrite: bool = True,
  1393. ) -> HDFStore:
  1394. """
  1395. Copy the existing store to a new file, updating in place.
  1396. Parameters
  1397. ----------
  1398. propindexes : bool, default True
  1399. Restore indexes in copied file.
  1400. keys : list, optional
  1401. List of keys to include in the copy (defaults to all).
  1402. overwrite : bool, default True
  1403. Whether to overwrite (remove and replace) existing nodes in the new store.
  1404. mode, complib, complevel, fletcher32 same as in HDFStore.__init__
  1405. Returns
  1406. -------
  1407. open file handle of the new store
  1408. """
  1409. new_store = HDFStore(
  1410. file, mode=mode, complib=complib, complevel=complevel, fletcher32=fletcher32
  1411. )
  1412. if keys is None:
  1413. keys = list(self.keys())
  1414. if not isinstance(keys, (tuple, list)):
  1415. keys = [keys]
  1416. for k in keys:
  1417. s = self.get_storer(k)
  1418. if s is not None:
  1419. if k in new_store:
  1420. if overwrite:
  1421. new_store.remove(k)
  1422. data = self.select(k)
  1423. if isinstance(s, Table):
  1424. index: bool | list[str] = False
  1425. if propindexes:
  1426. index = [a.name for a in s.axes if a.is_indexed]
  1427. new_store.append(
  1428. k,
  1429. data,
  1430. index=index,
  1431. data_columns=getattr(s, "data_columns", None),
  1432. encoding=s.encoding,
  1433. )
  1434. else:
  1435. new_store.put(k, data, encoding=s.encoding)
  1436. return new_store
  1437. def info(self) -> str:
  1438. """
  1439. Print detailed information on the store.
  1440. Returns
  1441. -------
  1442. str
  1443. Examples
  1444. --------
  1445. >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])
  1446. >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP
  1447. >>> store.put('data', df) # doctest: +SKIP
  1448. >>> print(store.info()) # doctest: +SKIP
  1449. >>> store.close() # doctest: +SKIP
  1450. <class 'pandas.io.pytables.HDFStore'>
  1451. File path: store.h5
  1452. /data frame (shape->[2,2])
  1453. """
  1454. path = pprint_thing(self._path)
  1455. output = f"{type(self)}\nFile path: {path}\n"
  1456. if self.is_open:
  1457. lkeys = sorted(self.keys())
  1458. if len(lkeys):
  1459. keys = []
  1460. values = []
  1461. for k in lkeys:
  1462. try:
  1463. s = self.get_storer(k)
  1464. if s is not None:
  1465. keys.append(pprint_thing(s.pathname or k))
  1466. values.append(pprint_thing(s or "invalid_HDFStore node"))
  1467. except AssertionError:
  1468. # surface any assertion errors for e.g. debugging
  1469. raise
  1470. except Exception as detail:
  1471. keys.append(k)
  1472. dstr = pprint_thing(detail)
  1473. values.append(f"[invalid_HDFStore node: {dstr}]")
  1474. output += adjoin(12, keys, values)
  1475. else:
  1476. output += "Empty"
  1477. else:
  1478. output += "File is CLOSED"
  1479. return output
  1480. # ------------------------------------------------------------------------
  1481. # private methods
  1482. def _check_if_open(self) -> None:
  1483. if not self.is_open:
  1484. raise ClosedFileError(f"{self._path} file is not open!")
  1485. def _validate_format(self, format: str) -> str:
  1486. """validate / deprecate formats"""
  1487. # validate
  1488. try:
  1489. format = _FORMAT_MAP[format.lower()]
  1490. except KeyError as err:
  1491. raise TypeError(f"invalid HDFStore format specified [{format}]") from err
  1492. return format
  1493. def _create_storer(
  1494. self,
  1495. group,
  1496. format=None,
  1497. value: DataFrame | Series | None = None,
  1498. encoding: str = "UTF-8",
  1499. errors: str = "strict",
  1500. ) -> GenericFixed | Table:
  1501. """return a suitable class to operate"""
  1502. cls: type[GenericFixed | Table]
  1503. if value is not None and not isinstance(value, (Series, DataFrame)):
  1504. raise TypeError("value must be None, Series, or DataFrame")
  1505. pt = _ensure_decoded(getattr(group._v_attrs, "pandas_type", None))
  1506. tt = _ensure_decoded(getattr(group._v_attrs, "table_type", None))
  1507. # infer the pt from the passed value
  1508. if pt is None:
  1509. if value is None:
  1510. _tables()
  1511. assert _table_mod is not None # for mypy
  1512. if getattr(group, "table", None) or isinstance(
  1513. group, _table_mod.table.Table
  1514. ):
  1515. pt = "frame_table"
  1516. tt = "generic_table"
  1517. else:
  1518. raise TypeError(
  1519. "cannot create a storer if the object is not existing "
  1520. "nor a value are passed"
  1521. )
  1522. else:
  1523. if isinstance(value, Series):
  1524. pt = "series"
  1525. else:
  1526. pt = "frame"
  1527. # we are actually a table
  1528. if format == "table":
  1529. pt += "_table"
  1530. # a storer node
  1531. if "table" not in pt:
  1532. _STORER_MAP = {"series": SeriesFixed, "frame": FrameFixed}
  1533. try:
  1534. cls = _STORER_MAP[pt]
  1535. except KeyError as err:
  1536. raise TypeError(
  1537. f"cannot properly create the storer for: [_STORER_MAP] [group->"
  1538. f"{group},value->{type(value)},format->{format}"
  1539. ) from err
  1540. return cls(self, group, encoding=encoding, errors=errors)
  1541. # existing node (and must be a table)
  1542. if tt is None:
  1543. # if we are a writer, determine the tt
  1544. if value is not None:
  1545. if pt == "series_table":
  1546. index = getattr(value, "index", None)
  1547. if index is not None:
  1548. if index.nlevels == 1:
  1549. tt = "appendable_series"
  1550. elif index.nlevels > 1:
  1551. tt = "appendable_multiseries"
  1552. elif pt == "frame_table":
  1553. index = getattr(value, "index", None)
  1554. if index is not None:
  1555. if index.nlevels == 1:
  1556. tt = "appendable_frame"
  1557. elif index.nlevels > 1:
  1558. tt = "appendable_multiframe"
  1559. _TABLE_MAP = {
  1560. "generic_table": GenericTable,
  1561. "appendable_series": AppendableSeriesTable,
  1562. "appendable_multiseries": AppendableMultiSeriesTable,
  1563. "appendable_frame": AppendableFrameTable,
  1564. "appendable_multiframe": AppendableMultiFrameTable,
  1565. "worm": WORMTable,
  1566. }
  1567. try:
  1568. cls = _TABLE_MAP[tt]
  1569. except KeyError as err:
  1570. raise TypeError(
  1571. f"cannot properly create the storer for: [_TABLE_MAP] [group->"
  1572. f"{group},value->{type(value)},format->{format}"
  1573. ) from err
  1574. return cls(self, group, encoding=encoding, errors=errors)
  1575. def _write_to_group(
  1576. self,
  1577. key: str,
  1578. value: DataFrame | Series,
  1579. format,
  1580. axes=None,
  1581. index: bool | list[str] = True,
  1582. append: bool = False,
  1583. complib=None,
  1584. complevel: int | None = None,
  1585. fletcher32=None,
  1586. min_itemsize: int | dict[str, int] | None = None,
  1587. chunksize: int | None = None,
  1588. expectedrows=None,
  1589. dropna: bool = False,
  1590. nan_rep=None,
  1591. data_columns=None,
  1592. encoding=None,
  1593. errors: str = "strict",
  1594. track_times: bool = True,
  1595. ) -> None:
  1596. # we don't want to store a table node at all if our object is 0-len
  1597. # as there are not dtypes
  1598. if getattr(value, "empty", None) and (format == "table" or append):
  1599. return
  1600. group = self._identify_group(key, append)
  1601. s = self._create_storer(group, format, value, encoding=encoding, errors=errors)
  1602. if append:
  1603. # raise if we are trying to append to a Fixed format,
  1604. # or a table that exists (and we are putting)
  1605. if not s.is_table or (s.is_table and format == "fixed" and s.is_exists):
  1606. raise ValueError("Can only append to Tables")
  1607. if not s.is_exists:
  1608. s.set_object_info()
  1609. else:
  1610. s.set_object_info()
  1611. if not s.is_table and complib:
  1612. raise ValueError("Compression not supported on Fixed format stores")
  1613. # write the object
  1614. s.write(
  1615. obj=value,
  1616. axes=axes,
  1617. append=append,
  1618. complib=complib,
  1619. complevel=complevel,
  1620. fletcher32=fletcher32,
  1621. min_itemsize=min_itemsize,
  1622. chunksize=chunksize,
  1623. expectedrows=expectedrows,
  1624. dropna=dropna,
  1625. nan_rep=nan_rep,
  1626. data_columns=data_columns,
  1627. track_times=track_times,
  1628. )
  1629. if isinstance(s, Table) and index:
  1630. s.create_index(columns=index)
  1631. def _read_group(self, group: Node):
  1632. s = self._create_storer(group)
  1633. s.infer_axes()
  1634. return s.read()
  1635. def _identify_group(self, key: str, append: bool) -> Node:
  1636. """Identify HDF5 group based on key, delete/create group if needed."""
  1637. group = self.get_node(key)
  1638. # we make this assertion for mypy; the get_node call will already
  1639. # have raised if this is incorrect
  1640. assert self._handle is not None
  1641. # remove the node if we are not appending
  1642. if group is not None and not append:
  1643. self._handle.remove_node(group, recursive=True)
  1644. group = None
  1645. if group is None:
  1646. group = self._create_nodes_and_group(key)
  1647. return group
  1648. def _create_nodes_and_group(self, key: str) -> Node:
  1649. """Create nodes from key and return group name."""
  1650. # assertion for mypy
  1651. assert self._handle is not None
  1652. paths = key.split("/")
  1653. # recursively create the groups
  1654. path = "/"
  1655. for p in paths:
  1656. if not len(p):
  1657. continue
  1658. new_path = path
  1659. if not path.endswith("/"):
  1660. new_path += "/"
  1661. new_path += p
  1662. group = self.get_node(new_path)
  1663. if group is None:
  1664. group = self._handle.create_group(path, p)
  1665. path = new_path
  1666. return group
  1667. class TableIterator:
  1668. """
  1669. Define the iteration interface on a table
  1670. Parameters
  1671. ----------
  1672. store : HDFStore
  1673. s : the referred storer
  1674. func : the function to execute the query
  1675. where : the where of the query
  1676. nrows : the rows to iterate on
  1677. start : the passed start value (default is None)
  1678. stop : the passed stop value (default is None)
  1679. iterator : bool, default False
  1680. Whether to use the default iterator.
  1681. chunksize : the passed chunking value (default is 100000)
  1682. auto_close : bool, default False
  1683. Whether to automatically close the store at the end of iteration.
  1684. """
  1685. chunksize: int | None
  1686. store: HDFStore
  1687. s: GenericFixed | Table
  1688. def __init__(
  1689. self,
  1690. store: HDFStore,
  1691. s: GenericFixed | Table,
  1692. func,
  1693. where,
  1694. nrows,
  1695. start=None,
  1696. stop=None,
  1697. iterator: bool = False,
  1698. chunksize: int | None = None,
  1699. auto_close: bool = False,
  1700. ) -> None:
  1701. self.store = store
  1702. self.s = s
  1703. self.func = func
  1704. self.where = where
  1705. # set start/stop if they are not set if we are a table
  1706. if self.s.is_table:
  1707. if nrows is None:
  1708. nrows = 0
  1709. if start is None:
  1710. start = 0
  1711. if stop is None:
  1712. stop = nrows
  1713. stop = min(nrows, stop)
  1714. self.nrows = nrows
  1715. self.start = start
  1716. self.stop = stop
  1717. self.coordinates = None
  1718. if iterator or chunksize is not None:
  1719. if chunksize is None:
  1720. chunksize = 100000
  1721. self.chunksize = int(chunksize)
  1722. else:
  1723. self.chunksize = None
  1724. self.auto_close = auto_close
  1725. def __iter__(self) -> Iterator:
  1726. # iterate
  1727. current = self.start
  1728. if self.coordinates is None:
  1729. raise ValueError("Cannot iterate until get_result is called.")
  1730. while current < self.stop:
  1731. stop = min(current + self.chunksize, self.stop)
  1732. value = self.func(None, None, self.coordinates[current:stop])
  1733. current = stop
  1734. if value is None or not len(value):
  1735. continue
  1736. yield value
  1737. self.close()
  1738. def close(self) -> None:
  1739. if self.auto_close:
  1740. self.store.close()
  1741. def get_result(self, coordinates: bool = False):
  1742. # return the actual iterator
  1743. if self.chunksize is not None:
  1744. if not isinstance(self.s, Table):
  1745. raise TypeError("can only use an iterator or chunksize on a table")
  1746. self.coordinates = self.s.read_coordinates(where=self.where)
  1747. return self
  1748. # if specified read via coordinates (necessary for multiple selections
  1749. if coordinates:
  1750. if not isinstance(self.s, Table):
  1751. raise TypeError("can only read_coordinates on a table")
  1752. where = self.s.read_coordinates(
  1753. where=self.where, start=self.start, stop=self.stop
  1754. )
  1755. else:
  1756. where = self.where
  1757. # directly return the result
  1758. results = self.func(self.start, self.stop, where)
  1759. self.close()
  1760. return results
  1761. class IndexCol:
  1762. """
  1763. an index column description class
  1764. Parameters
  1765. ----------
  1766. axis : axis which I reference
  1767. values : the ndarray like converted values
  1768. kind : a string description of this type
  1769. typ : the pytables type
  1770. pos : the position in the pytables
  1771. """
  1772. is_an_indexable: bool = True
  1773. is_data_indexable: bool = True
  1774. _info_fields = ["freq", "tz", "index_name"]
  1775. def __init__(
  1776. self,
  1777. name: str,
  1778. values=None,
  1779. kind=None,
  1780. typ=None,
  1781. cname: str | None = None,
  1782. axis=None,
  1783. pos=None,
  1784. freq=None,
  1785. tz=None,
  1786. index_name=None,
  1787. ordered=None,
  1788. table=None,
  1789. meta=None,
  1790. metadata=None,
  1791. ) -> None:
  1792. if not isinstance(name, str):
  1793. raise ValueError("`name` must be a str.")
  1794. self.values = values
  1795. self.kind = kind
  1796. self.typ = typ
  1797. self.name = name
  1798. self.cname = cname or name
  1799. self.axis = axis
  1800. self.pos = pos
  1801. self.freq = freq
  1802. self.tz = tz
  1803. self.index_name = index_name
  1804. self.ordered = ordered
  1805. self.table = table
  1806. self.meta = meta
  1807. self.metadata = metadata
  1808. if pos is not None:
  1809. self.set_pos(pos)
  1810. # These are ensured as long as the passed arguments match the
  1811. # constructor annotations.
  1812. assert isinstance(self.name, str)
  1813. assert isinstance(self.cname, str)
  1814. @property
  1815. def itemsize(self) -> int:
  1816. # Assumes self.typ has already been initialized
  1817. return self.typ.itemsize
  1818. @property
  1819. def kind_attr(self) -> str:
  1820. return f"{self.name}_kind"
  1821. def set_pos(self, pos: int) -> None:
  1822. """set the position of this column in the Table"""
  1823. self.pos = pos
  1824. if pos is not None and self.typ is not None:
  1825. self.typ._v_pos = pos
  1826. def __repr__(self) -> str:
  1827. temp = tuple(
  1828. map(pprint_thing, (self.name, self.cname, self.axis, self.pos, self.kind))
  1829. )
  1830. return ",".join(
  1831. [
  1832. f"{key}->{value}"
  1833. for key, value in zip(["name", "cname", "axis", "pos", "kind"], temp)
  1834. ]
  1835. )
  1836. def __eq__(self, other: object) -> bool:
  1837. """compare 2 col items"""
  1838. return all(
  1839. getattr(self, a, None) == getattr(other, a, None)
  1840. for a in ["name", "cname", "axis", "pos"]
  1841. )
  1842. def __ne__(self, other) -> bool:
  1843. return not self.__eq__(other)
  1844. @property
  1845. def is_indexed(self) -> bool:
  1846. """return whether I am an indexed column"""
  1847. if not hasattr(self.table, "cols"):
  1848. # e.g. if infer hasn't been called yet, self.table will be None.
  1849. return False
  1850. return getattr(self.table.cols, self.cname).is_indexed
  1851. def convert(
  1852. self, values: np.ndarray, nan_rep, encoding: str, errors: str
  1853. ) -> tuple[np.ndarray, np.ndarray] | tuple[Index, Index]:
  1854. """
  1855. Convert the data from this selection to the appropriate pandas type.
  1856. """
  1857. assert isinstance(values, np.ndarray), type(values)
  1858. # values is a recarray
  1859. if values.dtype.fields is not None:
  1860. # Copy, otherwise values will be a view
  1861. # preventing the original recarry from being free'ed
  1862. values = values[self.cname].copy()
  1863. val_kind = _ensure_decoded(self.kind)
  1864. values = _maybe_convert(values, val_kind, encoding, errors)
  1865. kwargs = {}
  1866. kwargs["name"] = _ensure_decoded(self.index_name)
  1867. if self.freq is not None:
  1868. kwargs["freq"] = _ensure_decoded(self.freq)
  1869. factory: type[Index | DatetimeIndex] = Index
  1870. if lib.is_np_dtype(values.dtype, "M") or isinstance(
  1871. values.dtype, DatetimeTZDtype
  1872. ):
  1873. factory = DatetimeIndex
  1874. elif values.dtype == "i8" and "freq" in kwargs:
  1875. # PeriodIndex data is stored as i8
  1876. # error: Incompatible types in assignment (expression has type
  1877. # "Callable[[Any, KwArg(Any)], PeriodIndex]", variable has type
  1878. # "Union[Type[Index], Type[DatetimeIndex]]")
  1879. factory = lambda x, **kwds: PeriodIndex.from_ordinals( # type: ignore[assignment]
  1880. x, freq=kwds.get("freq", None)
  1881. )._rename(
  1882. kwds["name"]
  1883. )
  1884. # making an Index instance could throw a number of different errors
  1885. try:
  1886. new_pd_index = factory(values, **kwargs)
  1887. except UnicodeEncodeError as err:
  1888. if (
  1889. errors == "surrogatepass"
  1890. and get_option("future.infer_string")
  1891. and str(err).endswith("surrogates not allowed")
  1892. and HAS_PYARROW
  1893. ):
  1894. new_pd_index = factory(
  1895. values,
  1896. dtype=StringDtype(storage="python", na_value=np.nan),
  1897. **kwargs,
  1898. )
  1899. else:
  1900. raise
  1901. except ValueError:
  1902. # if the output freq is different that what we recorded,
  1903. # it should be None (see also 'doc example part 2')
  1904. if "freq" in kwargs:
  1905. kwargs["freq"] = None
  1906. new_pd_index = factory(values, **kwargs)
  1907. final_pd_index = _set_tz(new_pd_index, self.tz)
  1908. return final_pd_index, final_pd_index
  1909. def take_data(self):
  1910. """return the values"""
  1911. return self.values
  1912. @property
  1913. def attrs(self):
  1914. return self.table._v_attrs
  1915. @property
  1916. def description(self):
  1917. return self.table.description
  1918. @property
  1919. def col(self):
  1920. """return my current col description"""
  1921. return getattr(self.description, self.cname, None)
  1922. @property
  1923. def cvalues(self):
  1924. """return my cython values"""
  1925. return self.values
  1926. def __iter__(self) -> Iterator:
  1927. return iter(self.values)
  1928. def maybe_set_size(self, min_itemsize=None) -> None:
  1929. """
  1930. maybe set a string col itemsize:
  1931. min_itemsize can be an integer or a dict with this columns name
  1932. with an integer size
  1933. """
  1934. if _ensure_decoded(self.kind) == "string":
  1935. if isinstance(min_itemsize, dict):
  1936. min_itemsize = min_itemsize.get(self.name)
  1937. if min_itemsize is not None and self.typ.itemsize < min_itemsize:
  1938. self.typ = _tables().StringCol(itemsize=min_itemsize, pos=self.pos)
  1939. def validate_names(self) -> None:
  1940. pass
  1941. def validate_and_set(self, handler: AppendableTable, append: bool) -> None:
  1942. self.table = handler.table
  1943. self.validate_col()
  1944. self.validate_attr(append)
  1945. self.validate_metadata(handler)
  1946. self.write_metadata(handler)
  1947. self.set_attr()
  1948. def validate_col(self, itemsize=None):
  1949. """validate this column: return the compared against itemsize"""
  1950. # validate this column for string truncation (or reset to the max size)
  1951. if _ensure_decoded(self.kind) == "string":
  1952. c = self.col
  1953. if c is not None:
  1954. if itemsize is None:
  1955. itemsize = self.itemsize
  1956. if c.itemsize < itemsize:
  1957. raise ValueError(
  1958. f"Trying to store a string with len [{itemsize}] in "
  1959. f"[{self.cname}] column but\nthis column has a limit of "
  1960. f"[{c.itemsize}]!\nConsider using min_itemsize to "
  1961. "preset the sizes on these columns"
  1962. )
  1963. return c.itemsize
  1964. return None
  1965. def validate_attr(self, append: bool) -> None:
  1966. # check for backwards incompatibility
  1967. if append:
  1968. existing_kind = getattr(self.attrs, self.kind_attr, None)
  1969. if existing_kind is not None and existing_kind != self.kind:
  1970. raise TypeError(
  1971. f"incompatible kind in col [{existing_kind} - {self.kind}]"
  1972. )
  1973. def update_info(self, info) -> None:
  1974. """
  1975. set/update the info for this indexable with the key/value
  1976. if there is a conflict raise/warn as needed
  1977. """
  1978. for key in self._info_fields:
  1979. value = getattr(self, key, None)
  1980. idx = info.setdefault(self.name, {})
  1981. existing_value = idx.get(key)
  1982. if key in idx and value is not None and existing_value != value:
  1983. # frequency/name just warn
  1984. if key in ["freq", "index_name"]:
  1985. ws = attribute_conflict_doc % (key, existing_value, value)
  1986. warnings.warn(
  1987. ws, AttributeConflictWarning, stacklevel=find_stack_level()
  1988. )
  1989. # reset
  1990. idx[key] = None
  1991. setattr(self, key, None)
  1992. else:
  1993. raise ValueError(
  1994. f"invalid info for [{self.name}] for [{key}], "
  1995. f"existing_value [{existing_value}] conflicts with "
  1996. f"new value [{value}]"
  1997. )
  1998. elif value is not None or existing_value is not None:
  1999. idx[key] = value
  2000. def set_info(self, info) -> None:
  2001. """set my state from the passed info"""
  2002. idx = info.get(self.name)
  2003. if idx is not None:
  2004. self.__dict__.update(idx)
  2005. def set_attr(self) -> None:
  2006. """set the kind for this column"""
  2007. setattr(self.attrs, self.kind_attr, self.kind)
  2008. def validate_metadata(self, handler: AppendableTable) -> None:
  2009. """validate that kind=category does not change the categories"""
  2010. if self.meta == "category":
  2011. new_metadata = self.metadata
  2012. cur_metadata = handler.read_metadata(self.cname)
  2013. if (
  2014. new_metadata is not None
  2015. and cur_metadata is not None
  2016. and not array_equivalent(
  2017. new_metadata, cur_metadata, strict_nan=True, dtype_equal=True
  2018. )
  2019. ):
  2020. raise ValueError(
  2021. "cannot append a categorical with "
  2022. "different categories to the existing"
  2023. )
  2024. def write_metadata(self, handler: AppendableTable) -> None:
  2025. """set the meta data"""
  2026. if self.metadata is not None:
  2027. handler.write_metadata(self.cname, self.metadata)
  2028. class GenericIndexCol(IndexCol):
  2029. """an index which is not represented in the data of the table"""
  2030. @property
  2031. def is_indexed(self) -> bool:
  2032. return False
  2033. def convert(
  2034. self, values: np.ndarray, nan_rep, encoding: str, errors: str
  2035. ) -> tuple[Index, Index]:
  2036. """
  2037. Convert the data from this selection to the appropriate pandas type.
  2038. Parameters
  2039. ----------
  2040. values : np.ndarray
  2041. nan_rep : str
  2042. encoding : str
  2043. errors : str
  2044. """
  2045. assert isinstance(values, np.ndarray), type(values)
  2046. index = RangeIndex(len(values))
  2047. return index, index
  2048. def set_attr(self) -> None:
  2049. pass
  2050. class DataCol(IndexCol):
  2051. """
  2052. a data holding column, by definition this is not indexable
  2053. Parameters
  2054. ----------
  2055. data : the actual data
  2056. cname : the column name in the table to hold the data (typically
  2057. values)
  2058. meta : a string description of the metadata
  2059. metadata : the actual metadata
  2060. """
  2061. is_an_indexable = False
  2062. is_data_indexable = False
  2063. _info_fields = ["tz", "ordered"]
  2064. def __init__(
  2065. self,
  2066. name: str,
  2067. values=None,
  2068. kind=None,
  2069. typ=None,
  2070. cname: str | None = None,
  2071. pos=None,
  2072. tz=None,
  2073. ordered=None,
  2074. table=None,
  2075. meta=None,
  2076. metadata=None,
  2077. dtype: DtypeArg | None = None,
  2078. data=None,
  2079. ) -> None:
  2080. super().__init__(
  2081. name=name,
  2082. values=values,
  2083. kind=kind,
  2084. typ=typ,
  2085. pos=pos,
  2086. cname=cname,
  2087. tz=tz,
  2088. ordered=ordered,
  2089. table=table,
  2090. meta=meta,
  2091. metadata=metadata,
  2092. )
  2093. self.dtype = dtype
  2094. self.data = data
  2095. @property
  2096. def dtype_attr(self) -> str:
  2097. return f"{self.name}_dtype"
  2098. @property
  2099. def meta_attr(self) -> str:
  2100. return f"{self.name}_meta"
  2101. def __repr__(self) -> str:
  2102. temp = tuple(
  2103. map(
  2104. pprint_thing, (self.name, self.cname, self.dtype, self.kind, self.shape)
  2105. )
  2106. )
  2107. return ",".join(
  2108. [
  2109. f"{key}->{value}"
  2110. for key, value in zip(["name", "cname", "dtype", "kind", "shape"], temp)
  2111. ]
  2112. )
  2113. def __eq__(self, other: object) -> bool:
  2114. """compare 2 col items"""
  2115. return all(
  2116. getattr(self, a, None) == getattr(other, a, None)
  2117. for a in ["name", "cname", "dtype", "pos"]
  2118. )
  2119. def set_data(self, data: ArrayLike) -> None:
  2120. assert data is not None
  2121. assert self.dtype is None
  2122. data, dtype_name = _get_data_and_dtype_name(data)
  2123. self.data = data
  2124. self.dtype = dtype_name
  2125. self.kind = _dtype_to_kind(dtype_name)
  2126. def take_data(self):
  2127. """return the data"""
  2128. return self.data
  2129. @classmethod
  2130. def _get_atom(cls, values: ArrayLike) -> Col:
  2131. """
  2132. Get an appropriately typed and shaped pytables.Col object for values.
  2133. """
  2134. dtype = values.dtype
  2135. # error: Item "ExtensionDtype" of "Union[ExtensionDtype, dtype[Any]]" has no
  2136. # attribute "itemsize"
  2137. itemsize = dtype.itemsize # type: ignore[union-attr]
  2138. shape = values.shape
  2139. if values.ndim == 1:
  2140. # EA, use block shape pretending it is 2D
  2141. # TODO(EA2D): not necessary with 2D EAs
  2142. shape = (1, values.size)
  2143. if isinstance(values, Categorical):
  2144. codes = values.codes
  2145. atom = cls.get_atom_data(shape, kind=codes.dtype.name)
  2146. elif lib.is_np_dtype(dtype, "M") or isinstance(dtype, DatetimeTZDtype):
  2147. atom = cls.get_atom_datetime64(shape)
  2148. elif lib.is_np_dtype(dtype, "m"):
  2149. atom = cls.get_atom_timedelta64(shape)
  2150. elif is_complex_dtype(dtype):
  2151. atom = _tables().ComplexCol(itemsize=itemsize, shape=shape[0])
  2152. elif is_string_dtype(dtype):
  2153. atom = cls.get_atom_string(shape, itemsize)
  2154. else:
  2155. atom = cls.get_atom_data(shape, kind=dtype.name)
  2156. return atom
  2157. @classmethod
  2158. def get_atom_string(cls, shape, itemsize):
  2159. return _tables().StringCol(itemsize=itemsize, shape=shape[0])
  2160. @classmethod
  2161. def get_atom_coltype(cls, kind: str) -> type[Col]:
  2162. """return the PyTables column class for this column"""
  2163. if kind.startswith("uint"):
  2164. k4 = kind[4:]
  2165. col_name = f"UInt{k4}Col"
  2166. elif kind.startswith("period"):
  2167. # we store as integer
  2168. col_name = "Int64Col"
  2169. else:
  2170. kcap = kind.capitalize()
  2171. col_name = f"{kcap}Col"
  2172. return getattr(_tables(), col_name)
  2173. @classmethod
  2174. def get_atom_data(cls, shape, kind: str) -> Col:
  2175. return cls.get_atom_coltype(kind=kind)(shape=shape[0])
  2176. @classmethod
  2177. def get_atom_datetime64(cls, shape):
  2178. return _tables().Int64Col(shape=shape[0])
  2179. @classmethod
  2180. def get_atom_timedelta64(cls, shape):
  2181. return _tables().Int64Col(shape=shape[0])
  2182. @property
  2183. def shape(self):
  2184. return getattr(self.data, "shape", None)
  2185. @property
  2186. def cvalues(self):
  2187. """return my cython values"""
  2188. return self.data
  2189. def validate_attr(self, append) -> None:
  2190. """validate that we have the same order as the existing & same dtype"""
  2191. if append:
  2192. existing_fields = getattr(self.attrs, self.kind_attr, None)
  2193. if existing_fields is not None and existing_fields != list(self.values):
  2194. raise ValueError("appended items do not match existing items in table!")
  2195. existing_dtype = getattr(self.attrs, self.dtype_attr, None)
  2196. if existing_dtype is not None and existing_dtype != self.dtype:
  2197. raise ValueError(
  2198. "appended items dtype do not match existing items dtype in table!"
  2199. )
def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str):
    """
    Convert the data from this selection to the appropriate pandas type.

    The stored dtype name (or, when none was stored, one inferred from
    ``values``) decides which reverse conversion is applied: datetime64,
    timedelta64, date, categorical (flagged through ``self.meta``) or a
    plain ``astype``. String kinds are decoded as a final step.

    Parameters
    ----------
    values : np.ndarray
        Raw selection; when it has named fields, the field ``self.cname``
        is used.
    nan_rep :
        Forwarded to the string un-conversion helper.
    encoding : str
        Forwarded to the string un-conversion helper.
    errors : str
        Forwarded to the string un-conversion helper.

    Returns
    -------
    index : listlike to become an Index
        This is ``self.values``, returned unchanged.
    data : ndarraylike to become a column
    """
    assert isinstance(values, np.ndarray), type(values)

    # values is a recarray
    if values.dtype.fields is not None:
        values = values[self.cname]

    assert self.typ is not None
    if self.dtype is None:
        # Note: in tests we never have timedelta64 or datetime64,
        #  so the _get_data_and_dtype_name may be unnecessary
        converted, dtype_name = _get_data_and_dtype_name(values)
        kind = _dtype_to_kind(dtype_name)
    else:
        # dtype/kind were recorded on the column; use the values as they are
        converted = values
        dtype_name = self.dtype
        kind = self.kind

    assert isinstance(converted, np.ndarray)  # for mypy

    # use the meta if needed
    meta = _ensure_decoded(self.meta)
    metadata = self.metadata
    ordered = self.ordered
    tz = self.tz

    assert dtype_name is not None
    # convert to the correct dtype
    dtype = _ensure_decoded(dtype_name)

    # reverse converts
    if dtype.startswith("datetime64"):
        # recreate with tz if indicated
        converted = _set_tz(converted, tz, coerce=True)

    elif dtype == "timedelta64":
        converted = np.asarray(converted, dtype="m8[ns]")
    elif dtype == "date":
        # try the values as ordinals first; on ValueError reinterpret them
        # as timestamps
        try:
            converted = np.asarray(
                [date.fromordinal(v) for v in converted], dtype=object
            )
        except ValueError:
            converted = np.asarray(
                [date.fromtimestamp(v) for v in converted], dtype=object
            )

    elif meta == "category":
        # we have a categorical
        categories = metadata
        codes = converted.ravel()

        # if we have stored a NaN in the categories
        # then strip it; in theory we could have BOTH
        # -1s in the codes and nulls :<
        if categories is None:
            # Handle case of NaN-only categorical columns in which case
            # the categories are an empty array; when this is stored,
            # pytables cannot write a zero-len array, so on readback
            # the categories would be None and `read_hdf()` would fail.
            categories = Index([], dtype=np.float64)
        else:
            mask = isna(categories)
            if mask.any():
                categories = categories[~mask]
                # NOTE(review): the left side has one entry per non -1 code,
                # the right side one entry per original category; this
                # presumably relies on the two lining up -- confirm.
                codes[codes != -1] -= mask.astype(int).cumsum()._values

        converted = Categorical.from_codes(
            codes, categories=categories, ordered=ordered, validate=False
        )

    else:
        # plain dtype: cast, falling back to object when the cast is rejected
        try:
            converted = converted.astype(dtype, copy=False)
        except TypeError:
            converted = converted.astype("O", copy=False)

    # convert nans / decode
    if _ensure_decoded(kind) == "string":
        converted = _unconvert_string_array(
            converted, nan_rep=nan_rep, encoding=encoding, errors=errors
        )

    return self.values, converted
  2284. def set_attr(self) -> None:
  2285. """set the data for this column"""
  2286. setattr(self.attrs, self.kind_attr, self.values)
  2287. setattr(self.attrs, self.meta_attr, self.meta)
  2288. assert self.dtype is not None
  2289. setattr(self.attrs, self.dtype_attr, self.dtype)
  2290. class DataIndexableCol(DataCol):
  2291. """represent a data column that can be indexed"""
  2292. is_data_indexable = True
  2293. def validate_names(self) -> None:
  2294. if not is_string_dtype(Index(self.values).dtype):
  2295. # TODO: should the message here be more specifically non-str?
  2296. raise ValueError("cannot have non-object label DataIndexableCol")
  2297. @classmethod
  2298. def get_atom_string(cls, shape, itemsize):
  2299. return _tables().StringCol(itemsize=itemsize)
  2300. @classmethod
  2301. def get_atom_data(cls, shape, kind: str) -> Col:
  2302. return cls.get_atom_coltype(kind=kind)()
  2303. @classmethod
  2304. def get_atom_datetime64(cls, shape):
  2305. return _tables().Int64Col()
  2306. @classmethod
  2307. def get_atom_timedelta64(cls, shape):
  2308. return _tables().Int64Col()
class GenericDataIndexableCol(DataIndexableCol):
    """
    represent a generic pytables data column

    Defines nothing of its own; all behaviour is inherited from
    DataIndexableCol.
    """
  2311. class Fixed:
  2312. """
  2313. represent an object in my store
  2314. facilitate read/write of various types of objects
  2315. this is an abstract base class
  2316. Parameters
  2317. ----------
  2318. parent : HDFStore
  2319. group : Node
  2320. The group node where the table resides.
  2321. """
  2322. pandas_kind: str
  2323. format_type: str = "fixed" # GH#30962 needed by dask
  2324. obj_type: type[DataFrame | Series]
  2325. ndim: int
  2326. parent: HDFStore
  2327. is_table: bool = False
  2328. def __init__(
  2329. self,
  2330. parent: HDFStore,
  2331. group: Node,
  2332. encoding: str | None = "UTF-8",
  2333. errors: str = "strict",
  2334. ) -> None:
  2335. assert isinstance(parent, HDFStore), type(parent)
  2336. assert _table_mod is not None # needed for mypy
  2337. assert isinstance(group, _table_mod.Node), type(group)
  2338. self.parent = parent
  2339. self.group = group
  2340. self.encoding = _ensure_encoding(encoding)
  2341. self.errors = errors
  2342. @property
  2343. def is_old_version(self) -> bool:
  2344. return self.version[0] <= 0 and self.version[1] <= 10 and self.version[2] < 1
  2345. @property
  2346. def version(self) -> tuple[int, int, int]:
  2347. """compute and set our version"""
  2348. version = _ensure_decoded(getattr(self.group._v_attrs, "pandas_version", None))
  2349. try:
  2350. version = tuple(int(x) for x in version.split("."))
  2351. if len(version) == 2:
  2352. version = version + (0,)
  2353. except AttributeError:
  2354. version = (0, 0, 0)
  2355. return version
  2356. @property
  2357. def pandas_type(self):
  2358. return _ensure_decoded(getattr(self.group._v_attrs, "pandas_type", None))
  2359. def __repr__(self) -> str:
  2360. """return a pretty representation of myself"""
  2361. self.infer_axes()
  2362. s = self.shape
  2363. if s is not None:
  2364. if isinstance(s, (list, tuple)):
  2365. jshape = ",".join([pprint_thing(x) for x in s])
  2366. s = f"[{jshape}]"
  2367. return f"{self.pandas_type:12.12} (shape->{s})"
  2368. return self.pandas_type
  2369. def set_object_info(self) -> None:
  2370. """set my pandas type & version"""
  2371. self.attrs.pandas_type = str(self.pandas_kind)
  2372. self.attrs.pandas_version = str(_version)
  2373. def copy(self) -> Fixed:
  2374. new_self = copy.copy(self)
  2375. return new_self
  2376. @property
  2377. def shape(self):
  2378. return self.nrows
  2379. @property
  2380. def pathname(self):
  2381. return self.group._v_pathname
  2382. @property
  2383. def _handle(self):
  2384. return self.parent._handle
  2385. @property
  2386. def _filters(self):
  2387. return self.parent._filters
  2388. @property
  2389. def _complevel(self) -> int:
  2390. return self.parent._complevel
  2391. @property
  2392. def _fletcher32(self) -> bool:
  2393. return self.parent._fletcher32
  2394. @property
  2395. def attrs(self):
  2396. return self.group._v_attrs
  2397. def set_attrs(self) -> None:
  2398. """set our object attributes"""
  2399. def get_attrs(self) -> None:
  2400. """get our object attributes"""
  2401. @property
  2402. def storable(self):
  2403. """return my storable"""
  2404. return self.group
  2405. @property
  2406. def is_exists(self) -> bool:
  2407. return False
  2408. @property
  2409. def nrows(self):
  2410. return getattr(self.storable, "nrows", None)
  2411. def validate(self, other) -> Literal[True] | None:
  2412. """validate against an existing storable"""
  2413. if other is None:
  2414. return None
  2415. return True
  2416. def validate_version(self, where=None) -> None:
  2417. """are we trying to operate on an old version?"""
  2418. def infer_axes(self) -> bool:
  2419. """
  2420. infer the axes of my storer
  2421. return a boolean indicating if we have a valid storer or not
  2422. """
  2423. s = self.storable
  2424. if s is None:
  2425. return False
  2426. self.get_attrs()
  2427. return True
  2428. def read(
  2429. self,
  2430. where=None,
  2431. columns=None,
  2432. start: int | None = None,
  2433. stop: int | None = None,
  2434. ):
  2435. raise NotImplementedError(
  2436. "cannot read on an abstract storer: subclasses should implement"
  2437. )
  2438. def write(self, obj, **kwargs) -> None:
  2439. raise NotImplementedError(
  2440. "cannot write on an abstract storer: subclasses should implement"
  2441. )
  2442. def delete(
  2443. self, where=None, start: int | None = None, stop: int | None = None
  2444. ) -> None:
  2445. """
  2446. support fully deleting the node in its entirety (only) - where
  2447. specification must be None
  2448. """
  2449. if com.all_none(where, start, stop):
  2450. self._handle.remove_node(self.group, recursive=True)
  2451. return None
  2452. raise TypeError("cannot delete on an abstract storer")
class GenericFixed(Fixed):
    """
    a generified fixed version

    Provides the shared machinery for reading and writing indexes and arrays
    as nodes under ``self.group``, together with the node attributes needed
    to rebuild them on read.
    """

    # index classes that are stored under a short alias, and the reverse lookup
    _index_type_map = {DatetimeIndex: "datetime", PeriodIndex: "period"}
    _reverse_index_map = {v: k for k, v in _index_type_map.items()}
    # names of group attributes that get_attrs copies onto the instance
    attributes: list[str] = []

    # indexer helpers
    def _class_to_alias(self, cls) -> str:
        """return the stored alias for an index class ("" when it has none)"""
        return self._index_type_map.get(cls, "")

    def _alias_to_class(self, alias):
        """return the index class for a stored alias, defaulting to Index"""
        if isinstance(alias, type):  # pragma: no cover
            # compat: for a short period of time master stored types
            return alias
        return self._reverse_index_map.get(alias, Index)

    def _get_index_factory(self, attrs):
        """
        Work out how to rebuild an index from a node's attributes.

        Returns
        -------
        factory : callable
            Builds the index from the un-converted values.
        kwargs : dict
            Holds ``freq`` and/or ``tz`` when the attributes carry them.
        """
        index_class = self._alias_to_class(
            _ensure_decoded(getattr(attrs, "index_class", ""))
        )

        factory: Callable

        if index_class == DatetimeIndex:

            def f(values, freq=None, tz=None):
                # data are already in UTC, localize and convert if tz present
                dta = DatetimeArray._simple_new(
                    values.values, dtype=values.dtype, freq=freq
                )
                result = DatetimeIndex._simple_new(dta, name=None)
                if tz is not None:
                    result = result.tz_localize("UTC").tz_convert(tz)
                return result

            factory = f
        elif index_class == PeriodIndex:

            def f(values, freq=None, tz=None):
                # tz is accepted for a uniform signature but not used here
                dtype = PeriodDtype(freq)
                parr = PeriodArray._simple_new(values, dtype=dtype)
                return PeriodIndex._simple_new(parr, name=None)

            factory = f
        else:
            factory = index_class

        kwargs = {}
        if "freq" in attrs:
            kwargs["freq"] = attrs["freq"]
            if index_class is Index:
                # DTI/PI would be gotten by _alias_to_class
                factory = TimedeltaIndex

        if "tz" in attrs:
            if isinstance(attrs["tz"], bytes):
                # created by python2
                kwargs["tz"] = attrs["tz"].decode("utf-8")
            else:
                # created by python3
                kwargs["tz"] = attrs["tz"]
            assert index_class is DatetimeIndex  # just checking
        return factory, kwargs

    def validate_read(self, columns, where) -> None:
        """
        raise if any keywords are passed which are not-None

        Raises
        ------
        TypeError
            If ``columns`` or ``where`` is given.
        """
        if columns is not None:
            raise TypeError(
                "cannot pass a column specification when reading "
                "a Fixed format store. this store must be selected in its entirety"
            )
        if where is not None:
            raise TypeError(
                "cannot pass a where specification when reading "
                "from a Fixed format store. this store must be selected in its entirety"
            )

    @property
    def is_exists(self) -> bool:
        return True

    def set_attrs(self) -> None:
        """set our object attributes"""
        self.attrs.encoding = self.encoding
        self.attrs.errors = self.errors

    def get_attrs(self) -> None:
        """retrieve our attributes"""
        self.encoding = _ensure_encoding(getattr(self.attrs, "encoding", None))
        self.errors = _ensure_decoded(getattr(self.attrs, "errors", "strict"))
        # copy each declared attribute (None when absent) onto the instance
        for n in self.attributes:
            setattr(self, n, _ensure_decoded(getattr(self.attrs, n, None)))

    def write(self, obj, **kwargs) -> None:
        """store encoding/errors on the group; ``obj`` and ``kwargs`` are unused here"""
        self.set_attrs()

    def read_array(self, key: str, start: int | None = None, stop: int | None = None):
        """
        read an array for the specified node (off of group)

        The ``[start:stop]`` slice is applied to the stored data, except when
        the node carries a ``shape`` attribute, in which case an empty array
        of that shape is rebuilt instead. The result is transposed back when
        the node's ``transposed`` attribute is set.
        """
        import tables

        node = getattr(self.group, key)
        attrs = node._v_attrs

        transposed = getattr(attrs, "transposed", False)

        if isinstance(node, tables.VLArray):
            # the payload is the first (index 0) entry of the VLArray
            ret = node[0][start:stop]
            dtype = getattr(attrs, "value_type", None)
            if dtype is not None:
                ret = pd_array(ret, dtype=dtype)
        else:
            dtype = _ensure_decoded(getattr(attrs, "value_type", None))
            shape = getattr(attrs, "shape", None)

            if shape is not None:
                # length 0 axis
                ret = np.empty(shape, dtype=dtype)
            else:
                ret = node[start:stop]

            if dtype and dtype.startswith("datetime64"):
                # reconstruct a timezone if indicated
                tz = getattr(attrs, "tz", None)
                ret = _set_tz(ret, tz, coerce=True)
            elif dtype == "timedelta64":
                ret = np.asarray(ret, dtype="m8[ns]")

        if transposed:
            return ret.T
        else:
            return ret

    def read_index(
        self, key: str, start: int | None = None, stop: int | None = None
    ) -> Index:
        """
        read the index stored under ``key``

        Dispatches on the ``{key}_variety`` attribute ("multi" or "regular").
        """
        variety = _ensure_decoded(getattr(self.attrs, f"{key}_variety"))

        if variety == "multi":
            return self.read_multi_index(key, start=start, stop=stop)
        elif variety == "regular":
            node = getattr(self.group, key)
            index = self.read_index_node(node, start=start, stop=stop)
            return index
        else:  # pragma: no cover
            raise TypeError(f"unrecognized index variety: {variety}")

    def write_index(self, key: str, index: Index) -> None:
        """
        write ``index`` under ``key``

        Records the variety on the group attributes; for a regular index also
        records kind, name and, where applicable, index_class, freq and tz on
        the node.
        """
        if isinstance(index, MultiIndex):
            setattr(self.attrs, f"{key}_variety", "multi")
            self.write_multi_index(key, index)
        else:
            setattr(self.attrs, f"{key}_variety", "regular")
            converted = _convert_index("index", index, self.encoding, self.errors)

            self.write_array(key, converted.values)

            # the node exists only after write_array has created it
            node = getattr(self.group, key)
            node._v_attrs.kind = converted.kind
            node._v_attrs.name = index.name

            if isinstance(index, (DatetimeIndex, PeriodIndex)):
                node._v_attrs.index_class = self._class_to_alias(type(index))

            if isinstance(index, (DatetimeIndex, PeriodIndex, TimedeltaIndex)):
                node._v_attrs.freq = index.freq

            if isinstance(index, DatetimeIndex) and index.tz is not None:
                node._v_attrs.tz = _get_tz(index.tz)

    def write_multi_index(self, key: str, index: MultiIndex) -> None:
        """
        write each level and its codes of ``index`` as separate arrays

        Raises
        ------
        NotImplementedError
            If a level has an extension dtype.
        """
        setattr(self.attrs, f"{key}_nlevels", index.nlevels)

        for i, (lev, level_codes, name) in enumerate(
            zip(index.levels, index.codes, index.names)
        ):
            # write the level
            if isinstance(lev.dtype, ExtensionDtype):
                raise NotImplementedError(
                    "Saving a MultiIndex with an extension dtype is not supported."
                )
            level_key = f"{key}_level{i}"
            conv_level = _convert_index(level_key, lev, self.encoding, self.errors)
            self.write_array(level_key, conv_level.values)
            node = getattr(self.group, level_key)
            node._v_attrs.kind = conv_level.kind
            node._v_attrs.name = name

            # write the name
            setattr(node._v_attrs, f"{key}_name{name}", name)

            # write the labels
            label_key = f"{key}_label{i}"
            self.write_array(label_key, level_codes)

    def read_multi_index(
        self, key: str, start: int | None = None, stop: int | None = None
    ) -> MultiIndex:
        """rebuild the MultiIndex stored under ``key`` from its levels and codes"""
        nlevels = getattr(self.attrs, f"{key}_nlevels")

        levels = []
        codes = []
        names: list[Hashable] = []
        for i in range(nlevels):
            level_key = f"{key}_level{i}"
            node = getattr(self.group, level_key)
            lev = self.read_index_node(node, start=start, stop=stop)
            levels.append(lev)
            names.append(lev.name)

            label_key = f"{key}_label{i}"
            level_codes = self.read_array(label_key, start=start, stop=stop)
            codes.append(level_codes)

        return MultiIndex(
            levels=levels, codes=codes, names=names, verify_integrity=True
        )

    def read_index_node(
        self, node: Node, start: int | None = None, stop: int | None = None
    ) -> Index:
        """
        rebuild an Index from a stored node

        The factory and its keyword arguments come from the node attributes
        (see ``_get_index_factory``); the stored name, when present, is set
        on the result.
        """
        data = node[start:stop]
        # If the index was an empty array write_array_empty() will
        # have written a sentinel. Here we replace it with the original.
        if "shape" in node._v_attrs and np.prod(node._v_attrs.shape) == 0:
            data = np.empty(node._v_attrs.shape, dtype=node._v_attrs.value_type)
        kind = _ensure_decoded(node._v_attrs.kind)
        name = None

        if "name" in node._v_attrs:
            name = _ensure_str(node._v_attrs.name)
            name = _ensure_decoded(name)

        attrs = node._v_attrs
        factory, kwargs = self._get_index_factory(attrs)

        if kind in ("date", "object"):
            index = factory(
                _unconvert_index(
                    data, kind, encoding=self.encoding, errors=self.errors
                ),
                dtype=object,
                **kwargs,
            )
        else:
            try:
                index = factory(
                    _unconvert_index(
                        data, kind, encoding=self.encoding, errors=self.errors
                    ),
                    **kwargs,
                )
            except UnicodeEncodeError as err:
                # Retry with a python-backed string dtype only for the
                # "surrogates not allowed" failure under surrogatepass with
                # future.infer_string enabled and pyarrow available;
                # anything else is re-raised unchanged.
                if (
                    self.errors == "surrogatepass"
                    and get_option("future.infer_string")
                    and str(err).endswith("surrogates not allowed")
                    and HAS_PYARROW
                ):
                    index = factory(
                        _unconvert_index(
                            data, kind, encoding=self.encoding, errors=self.errors
                        ),
                        dtype=StringDtype(storage="python", na_value=np.nan),
                        **kwargs,
                    )
                else:
                    raise

        index.name = name

        return index

    def write_array_empty(self, key: str, value: ArrayLike) -> None:
        """write a 0-len array"""
        # ugly hack for length 0 axes
        # a placeholder with one element per dimension is stored instead;
        # the real dtype and shape are kept on the node attributes
        arr = np.empty((1,) * value.ndim)
        self._handle.create_array(self.group, key, arr)
        node = getattr(self.group, key)
        node._v_attrs.value_type = str(value.dtype)
        node._v_attrs.shape = value.shape

    def write_array(
        self, key: str, obj: AnyArrayLike, items: Index | None = None
    ) -> None:
        """
        write ``obj`` as the node ``key`` under the group, replacing any
        existing node of that name

        ``items`` is only used in the text of the performance warning.

        Raises
        ------
        NotImplementedError
            If the values have a categorical dtype.
        """
        # TODO: we only have a few tests that get here, the only EA
        #  that gets passed is DatetimeArray, and we never have
        #  both self._filters and EA

        value = extract_array(obj, extract_numpy=True)

        if key in self.group:
            self._handle.remove_node(self.group, key)

        # Transform needed to interface with pytables row/col notation
        empty_array = value.size == 0
        transposed = False

        if isinstance(value.dtype, CategoricalDtype):
            raise NotImplementedError(
                "Cannot store a category dtype in a HDF5 dataset that uses format="
                '"fixed". Use format="table".'
            )
        if not empty_array:
            if hasattr(value, "T"):
                # ExtensionArrays (1d) may not have transpose.
                value = value.T
                transposed = True

        atom = None
        if self._filters is not None:
            with suppress(ValueError):
                # get the atom for this datatype
                atom = _tables().Atom.from_dtype(value.dtype)

        if atom is not None:
            # We only get here if self._filters is non-None and
            #  the Atom.from_dtype call succeeded

            # create an empty chunked array and fill it from value
            if not empty_array:
                ca = self._handle.create_carray(
                    self.group, key, atom, value.shape, filters=self._filters
                )
                ca[:] = value

            else:
                self.write_array_empty(key, value)

        elif value.dtype.type == np.object_:
            # infer the type, warn if we have a non-string type here (for
            # performance)
            inferred_type = lib.infer_dtype(value, skipna=False)
            if empty_array:
                pass
            elif inferred_type == "string":
                pass
            else:
                ws = performance_doc % (inferred_type, key, items)
                warnings.warn(ws, PerformanceWarning, stacklevel=find_stack_level())

            vlarr = self._handle.create_vlarray(self.group, key, _tables().ObjectAtom())
            vlarr.append(value)

        elif lib.is_np_dtype(value.dtype, "M"):
            # stored as an i8 view; the dtype string is kept for the read side
            self._handle.create_array(self.group, key, value.view("i8"))
            getattr(self.group, key)._v_attrs.value_type = str(value.dtype)
        elif isinstance(value.dtype, DatetimeTZDtype):
            # store as UTC
            # with a zone

            # error: Item "ExtensionArray" of "Union[Any, ExtensionArray]" has no
            # attribute "asi8"
            self._handle.create_array(
                self.group, key, value.asi8  # type: ignore[union-attr]
            )

            node = getattr(self.group, key)
            # error: Item "ExtensionArray" of "Union[Any, ExtensionArray]" has no
            # attribute "tz"
            node._v_attrs.tz = _get_tz(value.tz)  # type: ignore[union-attr]
            node._v_attrs.value_type = f"datetime64[{value.dtype.unit}]"
        elif lib.is_np_dtype(value.dtype, "m"):
            self._handle.create_array(self.group, key, value.view("i8"))
            getattr(self.group, key)._v_attrs.value_type = "timedelta64"
        elif isinstance(value, BaseStringArray):
            vlarr = self._handle.create_vlarray(self.group, key, _tables().ObjectAtom())
            vlarr.append(value.to_numpy())
            node = getattr(self.group, key)
            node._v_attrs.value_type = str(value.dtype)
        elif empty_array:
            self.write_array_empty(key, value)
        else:
            self._handle.create_array(self.group, key, value)

        # recorded on every path so read_array knows whether to transpose back
        getattr(self.group, key)._v_attrs.transposed = transposed
  2769. class SeriesFixed(GenericFixed):
  2770. pandas_kind = "series"
  2771. attributes = ["name"]
  2772. name: Hashable
  2773. @property
  2774. def shape(self):
  2775. try:
  2776. return (len(self.group.values),)
  2777. except (TypeError, AttributeError):
  2778. return None
  2779. def read(
  2780. self,
  2781. where=None,
  2782. columns=None,
  2783. start: int | None = None,
  2784. stop: int | None = None,
  2785. ) -> Series:
  2786. self.validate_read(columns, where)
  2787. index = self.read_index("index", start=start, stop=stop)
  2788. values = self.read_array("values", start=start, stop=stop)
  2789. try:
  2790. result = Series(values, index=index, name=self.name, copy=False)
  2791. except UnicodeEncodeError as err:
  2792. if (
  2793. self.errors == "surrogatepass"
  2794. and get_option("future.infer_string")
  2795. and str(err).endswith("surrogates not allowed")
  2796. and HAS_PYARROW
  2797. ):
  2798. result = Series(
  2799. values,
  2800. index=index,
  2801. name=self.name,
  2802. copy=False,
  2803. dtype=StringDtype(storage="python", na_value=np.nan),
  2804. )
  2805. else:
  2806. raise
  2807. return result
  2808. def write(self, obj, **kwargs) -> None:
  2809. super().write(obj, **kwargs)
  2810. self.write_index("index", obj.index)
  2811. self.write_array("values", obj)
  2812. self.attrs.name = obj.name
  2813. class BlockManagerFixed(GenericFixed):
  2814. attributes = ["ndim", "nblocks"]
  2815. nblocks: int
  2816. @property
  2817. def shape(self) -> Shape | None:
  2818. try:
  2819. ndim = self.ndim
  2820. # items
  2821. items = 0
  2822. for i in range(self.nblocks):
  2823. node = getattr(self.group, f"block{i}_items")
  2824. shape = getattr(node, "shape", None)
  2825. if shape is not None:
  2826. items += shape[0]
  2827. # data shape
  2828. node = self.group.block0_values
  2829. shape = getattr(node, "shape", None)
  2830. if shape is not None:
  2831. shape = list(shape[0 : (ndim - 1)])
  2832. else:
  2833. shape = []
  2834. shape.append(items)
  2835. return shape
  2836. except AttributeError:
  2837. return None
  2838. def read(
  2839. self,
  2840. where=None,
  2841. columns=None,
  2842. start: int | None = None,
  2843. stop: int | None = None,
  2844. ) -> DataFrame:
  2845. # start, stop applied to rows, so 0th axis only
  2846. self.validate_read(columns, where)
  2847. select_axis = self.obj_type()._get_block_manager_axis(0)
  2848. axes = []
  2849. for i in range(self.ndim):
  2850. _start, _stop = (start, stop) if i == select_axis else (None, None)
  2851. ax = self.read_index(f"axis{i}", start=_start, stop=_stop)
  2852. axes.append(ax)
  2853. items = axes[0]
  2854. dfs = []
  2855. for i in range(self.nblocks):
  2856. blk_items = self.read_index(f"block{i}_items")
  2857. values = self.read_array(f"block{i}_values", start=_start, stop=_stop)
  2858. columns = items[items.get_indexer(blk_items)]
  2859. df = DataFrame(values.T, columns=columns, index=axes[1], copy=False)
  2860. if (
  2861. using_string_dtype()
  2862. and isinstance(values, np.ndarray)
  2863. and is_string_array(values, skipna=True)
  2864. ):
  2865. df = df.astype(StringDtype(na_value=np.nan))
  2866. dfs.append(df)
  2867. if len(dfs) > 0:
  2868. out = concat(dfs, axis=1, copy=True)
  2869. if using_copy_on_write():
  2870. # with CoW, concat ignores the copy keyword. Here, we still want
  2871. # to copy to enforce optimized column-major layout
  2872. out = out.copy()
  2873. out = out.reindex(columns=items, copy=False)
  2874. return out
  2875. return DataFrame(columns=axes[0], index=axes[1])
  2876. def write(self, obj, **kwargs) -> None:
  2877. super().write(obj, **kwargs)
  2878. # TODO(ArrayManager) HDFStore relies on accessing the blocks
  2879. if isinstance(obj._mgr, ArrayManager):
  2880. obj = obj._as_manager("block")
  2881. data = obj._mgr
  2882. if not data.is_consolidated():
  2883. data = data.consolidate()
  2884. self.attrs.ndim = data.ndim
  2885. for i, ax in enumerate(data.axes):
  2886. if i == 0 and (not ax.is_unique):
  2887. raise ValueError("Columns index has to be unique for fixed format")
  2888. self.write_index(f"axis{i}", ax)
  2889. # Supporting mixed-type DataFrame objects...nontrivial
  2890. self.attrs.nblocks = len(data.blocks)
  2891. for i, blk in enumerate(data.blocks):
  2892. # I have no idea why, but writing values before items fixed #2299
  2893. blk_items = data.items.take(blk.mgr_locs)
  2894. self.write_array(f"block{i}_values", blk.values, items=blk_items)
  2895. self.write_index(f"block{i}_items", blk_items)
class FrameFixed(BlockManagerFixed):
    """block-manager based fixed storer whose object type is DataFrame"""

    # value written as the ``pandas_type`` attribute by set_object_info
    pandas_kind = "frame"
    # class instantiated (empty) by BlockManagerFixed.read to look up axes
    obj_type = DataFrame
  2899. class Table(Fixed):
  2900. """
  2901. represent a table:
  2902. facilitate read/write of various types of tables
  2903. Attrs in Table Node
  2904. -------------------
    These are attributes that are stored in the main table node; they are
    necessary to recreate these tables when read back in.
  2907. index_axes : a list of tuples of the (original indexing axis and
  2908. index column)
  2909. non_index_axes: a list of tuples of the (original index axis and
  2910. columns on a non-indexing axis)
  2911. values_axes : a list of the columns which comprise the data of this
  2912. table
  2913. data_columns : a list of the columns that we are allowing indexing
  2914. (these become single columns in values_axes)
  2915. nan_rep : the string to use for nan representations for string
  2916. objects
  2917. levels : the names of levels
  2918. metadata : the names of the metadata columns
  2919. """
  2920. pandas_kind = "wide_table"
  2921. format_type: str = "table" # GH#30962 needed by dask
  2922. table_type: str
  2923. levels: int | list[Hashable] = 1
  2924. is_table = True
  2925. metadata: list
  2926. def __init__(
  2927. self,
  2928. parent: HDFStore,
  2929. group: Node,
  2930. encoding: str | None = None,
  2931. errors: str = "strict",
  2932. index_axes: list[IndexCol] | None = None,
  2933. non_index_axes: list[tuple[AxisInt, Any]] | None = None,
  2934. values_axes: list[DataCol] | None = None,
  2935. data_columns: list | None = None,
  2936. info: dict | None = None,
  2937. nan_rep=None,
  2938. ) -> None:
  2939. super().__init__(parent, group, encoding=encoding, errors=errors)
  2940. self.index_axes = index_axes or []
  2941. self.non_index_axes = non_index_axes or []
  2942. self.values_axes = values_axes or []
  2943. self.data_columns = data_columns or []
  2944. self.info = info or {}
  2945. self.nan_rep = nan_rep
  2946. @property
  2947. def table_type_short(self) -> str:
  2948. return self.table_type.split("_")[0]
  2949. def __repr__(self) -> str:
  2950. """return a pretty representation of myself"""
  2951. self.infer_axes()
  2952. jdc = ",".join(self.data_columns) if len(self.data_columns) else ""
  2953. dc = f",dc->[{jdc}]"
  2954. ver = ""
  2955. if self.is_old_version:
  2956. jver = ".".join([str(x) for x in self.version])
  2957. ver = f"[{jver}]"
  2958. jindex_axes = ",".join([a.name for a in self.index_axes])
  2959. return (
  2960. f"{self.pandas_type:12.12}{ver} "
  2961. f"(typ->{self.table_type_short},nrows->{self.nrows},"
  2962. f"ncols->{self.ncols},indexers->[{jindex_axes}]{dc})"
  2963. )
  2964. def __getitem__(self, c: str):
  2965. """return the axis for c"""
  2966. for a in self.axes:
  2967. if c == a.name:
  2968. return a
  2969. return None
  2970. def validate(self, other) -> None:
  2971. """validate against an existing table"""
  2972. if other is None:
  2973. return
  2974. if other.table_type != self.table_type:
  2975. raise TypeError(
  2976. "incompatible table_type with existing "
  2977. f"[{other.table_type} - {self.table_type}]"
  2978. )
  2979. for c in ["index_axes", "non_index_axes", "values_axes"]:
  2980. sv = getattr(self, c, None)
  2981. ov = getattr(other, c, None)
  2982. if sv != ov:
  2983. # show the error for the specific axes
  2984. # Argument 1 to "enumerate" has incompatible type
  2985. # "Optional[Any]"; expected "Iterable[Any]" [arg-type]
  2986. for i, sax in enumerate(sv): # type: ignore[arg-type]
  2987. # Value of type "Optional[Any]" is not indexable [index]
  2988. oax = ov[i] # type: ignore[index]
  2989. if sax != oax:
  2990. if c == "values_axes" and sax.kind != oax.kind:
  2991. raise ValueError(
  2992. f"Cannot serialize the column [{oax.values[0]}] "
  2993. f"because its data contents are not [{sax.kind}] "
  2994. f"but [{oax.kind}] object dtype"
  2995. )
  2996. raise ValueError(
  2997. f"invalid combination of [{c}] on appending data "
  2998. f"[{sax}] vs current table [{oax}]"
  2999. )
  3000. # should never get here
  3001. raise Exception(
  3002. f"invalid combination of [{c}] on appending data [{sv}] vs "
  3003. f"current table [{ov}]"
  3004. )
  3005. @property
  3006. def is_multi_index(self) -> bool:
  3007. """the levels attribute is 1 or a list in the case of a multi-index"""
  3008. return isinstance(self.levels, list)
  3009. def validate_multiindex(
  3010. self, obj: DataFrame | Series
  3011. ) -> tuple[DataFrame, list[Hashable]]:
  3012. """
  3013. validate that we can store the multi-index; reset and return the
  3014. new object
  3015. """
  3016. levels = com.fill_missing_names(obj.index.names)
  3017. try:
  3018. reset_obj = obj.reset_index()
  3019. except ValueError as err:
  3020. raise ValueError(
  3021. "duplicate names/columns in the multi-index when storing as a table"
  3022. ) from err
  3023. assert isinstance(reset_obj, DataFrame) # for mypy
  3024. return reset_obj, levels
  3025. @property
  3026. def nrows_expected(self) -> int:
  3027. """based on our axes, compute the expected nrows"""
  3028. return np.prod([i.cvalues.shape[0] for i in self.index_axes])
  3029. @property
  3030. def is_exists(self) -> bool:
  3031. """has this table been created"""
  3032. return "table" in self.group
  3033. @property
  3034. def storable(self):
  3035. return getattr(self.group, "table", None)
  3036. @property
  3037. def table(self):
  3038. """return the table group (this is my storable)"""
  3039. return self.storable
  3040. @property
  3041. def dtype(self):
  3042. return self.table.dtype
  3043. @property
  3044. def description(self):
  3045. return self.table.description
  3046. @property
  3047. def axes(self) -> itertools.chain[IndexCol]:
  3048. return itertools.chain(self.index_axes, self.values_axes)
  3049. @property
  3050. def ncols(self) -> int:
  3051. """the number of total columns in the values axes"""
  3052. return sum(len(a.values) for a in self.values_axes)
  3053. @property
  3054. def is_transposed(self) -> bool:
  3055. return False
  3056. @property
  3057. def data_orientation(self) -> tuple[int, ...]:
  3058. """return a tuple of my permutated axes, non_indexable at the front"""
  3059. return tuple(
  3060. itertools.chain(
  3061. [int(a[0]) for a in self.non_index_axes],
  3062. [int(a.axis) for a in self.index_axes],
  3063. )
  3064. )
  3065. def queryables(self) -> dict[str, Any]:
  3066. """return a dict of the kinds allowable columns for this object"""
  3067. # mypy doesn't recognize DataFrame._AXIS_NAMES, so we re-write it here
  3068. axis_names = {0: "index", 1: "columns"}
  3069. # compute the values_axes queryables
  3070. d1 = [(a.cname, a) for a in self.index_axes]
  3071. d2 = [(axis_names[axis], None) for axis, values in self.non_index_axes]
  3072. d3 = [
  3073. (v.cname, v) for v in self.values_axes if v.name in set(self.data_columns)
  3074. ]
  3075. return dict(d1 + d2 + d3)
  3076. def index_cols(self):
  3077. """return a list of my index cols"""
  3078. # Note: each `i.cname` below is assured to be a str.
  3079. return [(i.axis, i.cname) for i in self.index_axes]
  3080. def values_cols(self) -> list[str]:
  3081. """return a list of my values cols"""
  3082. return [i.cname for i in self.values_axes]
  3083. def _get_metadata_path(self, key: str) -> str:
  3084. """return the metadata pathname for this key"""
  3085. group = self.group._v_pathname
  3086. return f"{group}/meta/{key}/meta"
  3087. def write_metadata(self, key: str, values: np.ndarray) -> None:
  3088. """
  3089. Write out a metadata array to the key as a fixed-format Series.
  3090. Parameters
  3091. ----------
  3092. key : str
  3093. values : ndarray
  3094. """
  3095. self.parent.put(
  3096. self._get_metadata_path(key),
  3097. Series(values, copy=False),
  3098. format="table",
  3099. encoding=self.encoding,
  3100. errors=self.errors,
  3101. nan_rep=self.nan_rep,
  3102. )
  3103. def read_metadata(self, key: str):
  3104. """return the meta data array for this key"""
  3105. if getattr(getattr(self.group, "meta", None), key, None) is not None:
  3106. return self.parent.select(self._get_metadata_path(key))
  3107. return None
  3108. def set_attrs(self) -> None:
  3109. """set our table type & indexables"""
  3110. self.attrs.table_type = str(self.table_type)
  3111. self.attrs.index_cols = self.index_cols()
  3112. self.attrs.values_cols = self.values_cols()
  3113. self.attrs.non_index_axes = self.non_index_axes
  3114. self.attrs.data_columns = self.data_columns
  3115. self.attrs.nan_rep = self.nan_rep
  3116. self.attrs.encoding = self.encoding
  3117. self.attrs.errors = self.errors
  3118. self.attrs.levels = self.levels
  3119. self.attrs.info = self.info
  3120. def get_attrs(self) -> None:
  3121. """retrieve our attributes"""
  3122. self.non_index_axes = getattr(self.attrs, "non_index_axes", None) or []
  3123. self.data_columns = getattr(self.attrs, "data_columns", None) or []
  3124. self.info = getattr(self.attrs, "info", None) or {}
  3125. self.nan_rep = getattr(self.attrs, "nan_rep", None)
  3126. self.encoding = _ensure_encoding(getattr(self.attrs, "encoding", None))
  3127. self.errors = _ensure_decoded(getattr(self.attrs, "errors", "strict"))
  3128. self.levels: list[Hashable] = getattr(self.attrs, "levels", None) or []
  3129. self.index_axes = [a for a in self.indexables if a.is_an_indexable]
  3130. self.values_axes = [a for a in self.indexables if not a.is_an_indexable]
  3131. def validate_version(self, where=None) -> None:
  3132. """are we trying to operate on an old version?"""
  3133. if where is not None:
  3134. if self.is_old_version:
  3135. ws = incompatibility_doc % ".".join([str(x) for x in self.version])
  3136. warnings.warn(
  3137. ws,
  3138. IncompatibilityWarning,
  3139. stacklevel=find_stack_level(),
  3140. )
  3141. def validate_min_itemsize(self, min_itemsize) -> None:
  3142. """
  3143. validate the min_itemsize doesn't contain items that are not in the
  3144. axes this needs data_columns to be defined
  3145. """
  3146. if min_itemsize is None:
  3147. return
  3148. if not isinstance(min_itemsize, dict):
  3149. return
  3150. q = self.queryables()
  3151. for k in min_itemsize:
  3152. # ok, apply generally
  3153. if k == "values":
  3154. continue
  3155. if k not in q:
  3156. raise ValueError(
  3157. f"min_itemsize has the key [{k}] which is not an axis or "
  3158. "data_column"
  3159. )
    @cache_readonly
    def indexables(self):
        """
        Create (and cache) the column descriptors of the stored table.

        The returned list holds one IndexCol per entry of
        ``self.attrs.index_cols`` followed by one DataCol / DataIndexableCol
        per entry of ``self.attrs.values_cols``; positions are numbered
        consecutively across both groups.
        """
        _indexables = []

        desc = self.description
        table_attrs = self.table.attrs

        # Note: each of the `name` kwargs below are str, ensured
        #  by the definition in index_cols.
        # index columns
        for i, (axis, name) in enumerate(self.attrs.index_cols):
            atom = getattr(desc, name)
            md = self.read_metadata(name)
            # an index column with stored metadata is flagged as categorical
            meta = "category" if md is not None else None

            kind_attr = f"{name}_kind"
            kind = getattr(table_attrs, kind_attr, None)

            index_col = IndexCol(
                name=name,
                axis=axis,
                pos=i,
                kind=kind,
                typ=atom,
                table=self.table,
                meta=meta,
                metadata=md,
            )
            _indexables.append(index_col)

        # values columns
        dc = set(self.data_columns)
        # values columns are positioned after all index columns
        base_pos = len(_indexables)

        def f(i, c):
            # build the descriptor for the values column with cname `c`
            assert isinstance(c, str)
            klass = DataCol
            if c in dc:
                # columns listed in data_columns get the indexable class
                klass = DataIndexableCol

            atom = getattr(desc, c)
            adj_name = _maybe_adjust_name(c, self.version)

            # TODO: why kind_attr here?
            # NOTE(review): `values` is read from the "<name>_kind" attribute
            values = getattr(table_attrs, f"{adj_name}_kind", None)
            dtype = getattr(table_attrs, f"{adj_name}_dtype", None)
            # Argument 1 to "_dtype_to_kind" has incompatible type
            # "Optional[Any]"; expected "str"  [arg-type]
            kind = _dtype_to_kind(dtype)  # type: ignore[arg-type]

            md = self.read_metadata(c)
            # TODO: figure out why these two versions of `meta` dont always match.
            #  meta = "category" if md is not None else None
            meta = getattr(table_attrs, f"{adj_name}_meta", None)

            obj = klass(
                name=adj_name,
                cname=c,
                values=values,
                kind=kind,
                pos=base_pos + i,
                typ=atom,
                table=self.table,
                meta=meta,
                metadata=md,
                dtype=dtype,
            )
            return obj

        # Note: the definition of `values_cols` ensures that each
        #  `c` below is a str.
        _indexables.extend([f(i, c) for i, c in enumerate(self.attrs.values_cols)])

        return _indexables
  3223. def create_index(
  3224. self, columns=None, optlevel=None, kind: str | None = None
  3225. ) -> None:
  3226. """
  3227. Create a pytables index on the specified columns.
  3228. Parameters
  3229. ----------
  3230. columns : None, bool, or listlike[str]
  3231. Indicate which columns to create an index on.
  3232. * False : Do not create any indexes.
  3233. * True : Create indexes on all columns.
  3234. * None : Create indexes on all columns.
  3235. * listlike : Create indexes on the given columns.
  3236. optlevel : int or None, default None
  3237. Optimization level, if None, pytables defaults to 6.
  3238. kind : str or None, default None
  3239. Kind of index, if None, pytables defaults to "medium".
  3240. Raises
  3241. ------
  3242. TypeError if trying to create an index on a complex-type column.
  3243. Notes
  3244. -----
  3245. Cannot index Time64Col or ComplexCol.
  3246. Pytables must be >= 3.0.
  3247. """
  3248. if not self.infer_axes():
  3249. return
  3250. if columns is False:
  3251. return
  3252. # index all indexables and data_columns
  3253. if columns is None or columns is True:
  3254. columns = [a.cname for a in self.axes if a.is_data_indexable]
  3255. if not isinstance(columns, (tuple, list)):
  3256. columns = [columns]
  3257. kw = {}
  3258. if optlevel is not None:
  3259. kw["optlevel"] = optlevel
  3260. if kind is not None:
  3261. kw["kind"] = kind
  3262. table = self.table
  3263. for c in columns:
  3264. v = getattr(table.cols, c, None)
  3265. if v is not None:
  3266. # remove the index if the kind/optlevel have changed
  3267. if v.is_indexed:
  3268. index = v.index
  3269. cur_optlevel = index.optlevel
  3270. cur_kind = index.kind
  3271. if kind is not None and cur_kind != kind:
  3272. v.remove_index()
  3273. else:
  3274. kw["kind"] = cur_kind
  3275. if optlevel is not None and cur_optlevel != optlevel:
  3276. v.remove_index()
  3277. else:
  3278. kw["optlevel"] = cur_optlevel
  3279. # create the index
  3280. if not v.is_indexed:
  3281. if v.type.startswith("complex"):
  3282. raise TypeError(
  3283. "Columns containing complex values can be stored but "
  3284. "cannot be indexed when using table format. Either use "
  3285. "fixed format, set index=False, or do not include "
  3286. "the columns containing complex values to "
  3287. "data_columns when initializing the table."
  3288. )
  3289. v.create_index(**kw)
  3290. elif c in self.non_index_axes[0][1]:
  3291. # GH 28156
  3292. raise AttributeError(
  3293. f"column {c} is not a data_column.\n"
  3294. f"In order to read column {c} you must reload the dataframe \n"
  3295. f"into HDFStore and include {c} with the data_columns argument."
  3296. )
  3297. def _read_axes(
  3298. self, where, start: int | None = None, stop: int | None = None
  3299. ) -> list[tuple[np.ndarray, np.ndarray] | tuple[Index, Index]]:
  3300. """
  3301. Create the axes sniffed from the table.
  3302. Parameters
  3303. ----------
  3304. where : ???
  3305. start : int or None, default None
  3306. stop : int or None, default None
  3307. Returns
  3308. -------
  3309. List[Tuple[index_values, column_values]]
  3310. """
  3311. # create the selection
  3312. selection = Selection(self, where=where, start=start, stop=stop)
  3313. values = selection.select()
  3314. results = []
  3315. # convert the data
  3316. for a in self.axes:
  3317. a.set_info(self.info)
  3318. res = a.convert(
  3319. values,
  3320. nan_rep=self.nan_rep,
  3321. encoding=self.encoding,
  3322. errors=self.errors,
  3323. )
  3324. results.append(res)
  3325. return results
  3326. @classmethod
  3327. def get_object(cls, obj, transposed: bool):
  3328. """return the data for this obj"""
  3329. return obj
  3330. def validate_data_columns(self, data_columns, min_itemsize, non_index_axes):
  3331. """
  3332. take the input data_columns and min_itemize and create a data
  3333. columns spec
  3334. """
  3335. if not len(non_index_axes):
  3336. return []
  3337. axis, axis_labels = non_index_axes[0]
  3338. info = self.info.get(axis, {})
  3339. if info.get("type") == "MultiIndex" and data_columns:
  3340. raise ValueError(
  3341. f"cannot use a multi-index on axis [{axis}] with "
  3342. f"data_columns {data_columns}"
  3343. )
  3344. # evaluate the passed data_columns, True == use all columns
  3345. # take only valid axis labels
  3346. if data_columns is True:
  3347. data_columns = list(axis_labels)
  3348. elif data_columns is None:
  3349. data_columns = []
  3350. # if min_itemsize is a dict, add the keys (exclude 'values')
  3351. if isinstance(min_itemsize, dict):
  3352. existing_data_columns = set(data_columns)
  3353. data_columns = list(data_columns) # ensure we do not modify
  3354. data_columns.extend(
  3355. [
  3356. k
  3357. for k in min_itemsize.keys()
  3358. if k != "values" and k not in existing_data_columns
  3359. ]
  3360. )
  3361. # return valid columns in the order of our axis
  3362. return [c for c in data_columns if c in axis_labels]
    def _create_axes(
        self,
        axes,
        obj: DataFrame,
        validate: bool = True,
        nan_rep=None,
        data_columns=None,
        min_itemsize=None,
    ):
        """
        Create and return the axes.

        Builds a new table object of the same type as ``self`` whose index
        axis, non-index axis and values axes describe ``obj``. When a table
        already exists its axes, data_columns and nan_rep take precedence
        over the arguments.

        Parameters
        ----------
        axes: list or None
            The names or numbers of the axes to create.
        obj : DataFrame
            The object to create axes on.
        validate: bool, default True
            Whether to validate the obj against an existing object already written.
        nan_rep :
            A value to use for string column nan_rep.
        data_columns : List[str], True, or None, default None
            Specify the columns that we want to create to allow indexing on.

            * True : Use all available columns.
            * None : Use no columns.
            * List[str] : Use the specified columns.

        min_itemsize: Dict[str, int] or None, default None
            The min itemsize for a column in bytes.

        Returns
        -------
        A new instance of ``type(self)`` describing the data to be written.

        Raises
        ------
        TypeError
            If ``obj`` is not a DataFrame.
        ValueError
            If the number of indexing axes is not ndim-1, if a data column
            label is not a str, or if the blocks cannot be matched against
            the values axes of an existing table.
        """
        if not isinstance(obj, DataFrame):
            group = self.group._v_name
            raise TypeError(
                f"cannot properly create the storer for: [group->{group},"
                f"value->{type(obj)}]"
            )

        # set the default axes if needed
        if axes is None:
            axes = [0]

        # map axes to numbers
        axes = [obj._get_axis_number(a) for a in axes]

        # do we have an existing table (if so, use its axes & data_columns)
        if self.infer_axes():
            table_exists = True
            axes = [a.axis for a in self.index_axes]
            data_columns = list(self.data_columns)
            nan_rep = self.nan_rep
            # TODO: do we always have validate=True here?
        else:
            table_exists = False

        # note: this is the same dict object as self.info, so the updates
        # below are visible through self.info as well
        new_info = self.info

        assert self.ndim == 2  # with next check, we must have len(axes) == 1
        # currently support on ndim-1 axes
        if len(axes) != self.ndim - 1:
            raise ValueError(
                "currently only support ndim-1 indexers in an AppendableTable"
            )

        # create according to the new data
        new_non_index_axes: list = []

        # nan_representation
        if nan_rep is None:
            nan_rep = "nan"

        # We construct the non-index-axis first, since that alters new_info
        idx = next(x for x in [0, 1] if x not in axes)

        a = obj.axes[idx]
        # we might be able to change the axes on the appending data if necessary
        append_axis = list(a)
        if table_exists:
            indexer = len(new_non_index_axes)  # i.e. 0
            exist_axis = self.non_index_axes[indexer][1]
            if not array_equivalent(
                np.array(append_axis),
                np.array(exist_axis),
                strict_nan=True,
                dtype_equal=True,
            ):
                # ahah! -> reindex
                # same labels in a different order: adopt the stored order
                if array_equivalent(
                    np.array(sorted(append_axis)),
                    np.array(sorted(exist_axis)),
                    strict_nan=True,
                    dtype_equal=True,
                ):
                    append_axis = exist_axis

        # the non_index_axes info
        info = new_info.setdefault(idx, {})
        info["names"] = list(a.names)
        info["type"] = type(a).__name__

        new_non_index_axes.append((idx, append_axis))

        # Now we can construct our new index axis
        idx = axes[0]
        a = obj.axes[idx]
        axis_name = obj._get_axis_name(idx)
        new_index = _convert_index(axis_name, a, self.encoding, self.errors)
        new_index.axis = idx

        # Because we are always 2D, there is only one new_index, so
        #  we know it will have pos=0
        new_index.set_pos(0)
        new_index.update_info(new_info)
        new_index.maybe_set_size(min_itemsize)  # check for column conflicts

        new_index_axes = [new_index]
        # j is the position handed to the next values column
        j = len(new_index_axes)  # i.e. 1
        assert j == 1

        # reindex by our non_index_axes & compute data_columns
        assert len(new_non_index_axes) == 1
        for a in new_non_index_axes:
            obj = _reindex_axis(obj, a[0], a[1])

        transposed = new_index.axis == 1

        # figure out data_columns and get out blocks
        data_columns = self.validate_data_columns(
            data_columns, min_itemsize, new_non_index_axes
        )

        frame = self.get_object(obj, transposed)._consolidate()

        blocks, blk_items = self._get_blocks_and_items(
            frame, table_exists, new_non_index_axes, self.values_axes, data_columns
        )

        # add my values
        vaxes = []
        for i, (blk, b_items) in enumerate(zip(blocks, blk_items)):
            # shape of the data column are the indexable axes
            klass = DataCol
            name = None

            # we have a data_column
            if data_columns and len(b_items) == 1 and b_items[0] in data_columns:
                klass = DataIndexableCol
                name = b_items[0]
                if not (name is None or isinstance(name, str)):
                    # TODO: should the message here be more specifically non-str?
                    raise ValueError("cannot have non-object label DataIndexableCol")

            # make sure that we match up the existing columns
            # if we have an existing table
            existing_col: DataCol | None

            if table_exists and validate:
                try:
                    existing_col = self.values_axes[i]
                except (IndexError, KeyError) as err:
                    # NOTE(review): the two f-string parts below join without a
                    # separating space ("...]with existing table...")
                    raise ValueError(
                        f"Incompatible appended table [{blocks}]"
                        f"with existing table [{self.values_axes}]"
                    ) from err
            else:
                existing_col = None

            # blocks that are not a single data column get a generated name
            new_name = name or f"values_block_{i}"
            data_converted = _maybe_convert_for_string_atom(
                new_name,
                blk.values,
                existing_col=existing_col,
                min_itemsize=min_itemsize,
                nan_rep=nan_rep,
                encoding=self.encoding,
                errors=self.errors,
                columns=b_items,
            )
            adj_name = _maybe_adjust_name(new_name, self.version)

            typ = klass._get_atom(data_converted)
            kind = _dtype_to_kind(data_converted.dtype.name)
            tz = None
            if getattr(data_converted, "tz", None) is not None:
                tz = _get_tz(data_converted.tz)

            # extra metadata is only recorded for categorical and string dtypes
            meta = metadata = ordered = None
            if isinstance(data_converted.dtype, CategoricalDtype):
                ordered = data_converted.ordered
                meta = "category"
                metadata = np.asarray(data_converted.categories).ravel()
            elif isinstance(blk.dtype, StringDtype):
                meta = str(blk.dtype)

            data, dtype_name = _get_data_and_dtype_name(data_converted)

            col = klass(
                name=adj_name,
                cname=new_name,
                values=list(b_items),
                typ=typ,
                pos=j,
                kind=kind,
                tz=tz,
                ordered=ordered,
                meta=meta,
                metadata=metadata,
                dtype=dtype_name,
                data=data,
            )
            col.update_info(new_info)

            vaxes.append(col)

            j += 1

        # the data columns actually created, taken from the values axes
        dcs = [col.name for col in vaxes if col.is_data_indexable]

        new_table = type(self)(
            parent=self.parent,
            group=self.group,
            encoding=self.encoding,
            errors=self.errors,
            index_axes=new_index_axes,
            non_index_axes=new_non_index_axes,
            values_axes=vaxes,
            data_columns=dcs,
            info=new_info,
            nan_rep=nan_rep,
        )
        if hasattr(self, "levels"):
            # TODO: get this into constructor, only for appropriate subclass
            new_table.levels = self.levels

        new_table.validate_min_itemsize(min_itemsize)

        if validate and table_exists:
            new_table.validate(self)

        return new_table
    @staticmethod
    def _get_blocks_and_items(
        frame: DataFrame,
        table_exists: bool,
        new_non_index_axes,
        values_axes,
        data_columns,
    ):
        """
        Return the blocks of ``frame`` and, for each block, its item labels.

        With ``data_columns`` given, each data column is split out into its
        own entry placed after the blocks of the remaining labels. With
        ``table_exists`` true, the result is reordered to follow
        ``values_axes``.

        Raises
        ------
        ValueError
            If a values axis of the existing table has no block with exactly
            the same items.
        """
        # Helper to clarify non-state-altering parts of _create_axes

        # TODO(ArrayManager) HDFStore relies on accessing the blocks
        if isinstance(frame._mgr, ArrayManager):
            frame = frame._as_manager("block")

        def get_blk_items(mgr):
            # the item labels of each block, located through the block's mgr_locs
            return [mgr.items.take(blk.mgr_locs) for blk in mgr.blocks]

        mgr = frame._mgr
        mgr = cast(BlockManager, mgr)
        blocks: list[Block] = list(mgr.blocks)
        blk_items: list[Index] = get_blk_items(mgr)

        if len(data_columns):
            # TODO: prove that we only get here with axis == 1?
            #  It is the case in all extant tests, but NOT the case
            #  outside this `if len(data_columns)` check.

            axis, axis_labels = new_non_index_axes[0]
            # first the blocks of everything that is not a data column ...
            new_labels = Index(axis_labels).difference(Index(data_columns))
            mgr = frame.reindex(new_labels, axis=axis)._mgr
            mgr = cast(BlockManager, mgr)

            blocks = list(mgr.blocks)
            blk_items = get_blk_items(mgr)
            # ... then the blocks of each data column on its own
            for c in data_columns:
                # This reindex would raise ValueError if we had a duplicate
                #  index, so we can infer that (as long as axis==1) we
                #  get a single column back, so a single block.
                mgr = frame.reindex([c], axis=axis)._mgr
                mgr = cast(BlockManager, mgr)
                blocks.extend(mgr.blocks)
                blk_items.extend(get_blk_items(mgr))

        # reorder the blocks in the same order as the existing table if we can
        if table_exists:
            # key every block by the tuple of its item labels
            by_items = {
                tuple(b_items.tolist()): (b, b_items)
                for b, b_items in zip(blocks, blk_items)
            }
            new_blocks: list[Block] = []
            new_blk_items = []
            for ea in values_axes:
                items = tuple(ea.values)
                try:
                    # pop so that each block is matched at most once
                    b, b_items = by_items.pop(items)
                    new_blocks.append(b)
                    new_blk_items.append(b_items)
                except (IndexError, KeyError) as err:
                    jitems = ",".join([pprint_thing(item) for item in items])
                    raise ValueError(
                        f"cannot match existing table structure for [{jitems}] "
                        "on appending data"
                    ) from err
            blocks = new_blocks
            blk_items = new_blk_items

        return blocks, blk_items
    def process_axes(self, obj, selection: Selection, columns=None) -> DataFrame:
        """
        Reindex ``obj`` by the non-index axes and apply the selection filters.

        Parameters
        ----------
        obj : the object to process
        selection : Selection
            Its ``filter`` attribute, when not None, supplies
            ``(field, op, filt)`` triples that are applied in order.
        columns : list-like or None
            Labels to limit the result to; copied before use. For a
            multi-index table the level names are added to it.

        Raises
        ------
        ValueError
            If a filter field is neither an axis name nor a label in an axis.
        """
        # make a copy to avoid side effects
        if columns is not None:
            columns = list(columns)

        # make sure to include levels if we have them
        if columns is not None and self.is_multi_index:
            assert isinstance(self.levels, list)  # assured by is_multi_index
            for n in self.levels:
                if n not in columns:
                    # missing levels are inserted at the front
                    columns.insert(0, n)

        # reorder by any non_index_axes & limit to the select columns
        for axis, labels in self.non_index_axes:
            obj = _reindex_axis(obj, axis, labels, columns)

        def process_filter(field, filt, op):
            # apply a single filter to the current `obj` and return the result
            for axis_name in obj._AXIS_ORDERS:
                axis_number = obj._get_axis_number(axis_name)
                axis_values = obj._get_axis(axis_name)
                assert axis_number is not None

                # see if the field is the name of an axis
                if field == axis_name:
                    # if we have a multi-index, then need to include
                    # the levels
                    if self.is_multi_index:
                        filt = filt.union(Index(self.levels))

                    takers = op(axis_values, filt)
                    return obj.loc(axis=axis_number)[takers]

                # this might be the name of a field IN an axis
                elif field in axis_values:
                    # we need to filter on this dimension
                    values = ensure_index(getattr(obj, field).values)
                    filt = ensure_index(filt)

                    # hack until we support reversed dim flags
                    if isinstance(obj, DataFrame):
                        axis_number = 1 - axis_number

                    takers = op(values, filt)
                    return obj.loc(axis=axis_number)[takers]

            raise ValueError(f"cannot find the field [{field}] for filtering!")

        # apply the selection filters (but keep in the same order)
        if selection.filter is not None:
            for field, op, filt in selection.filter.format():
                obj = process_filter(field, filt, op)

        return obj
  3668. def create_description(
  3669. self,
  3670. complib,
  3671. complevel: int | None,
  3672. fletcher32: bool,
  3673. expectedrows: int | None,
  3674. ) -> dict[str, Any]:
  3675. """create the description of the table from the axes & values"""
  3676. # provided expected rows if its passed
  3677. if expectedrows is None:
  3678. expectedrows = max(self.nrows_expected, 10000)
  3679. d = {"name": "table", "expectedrows": expectedrows}
  3680. # description from the axes & values
  3681. d["description"] = {a.cname: a.typ for a in self.axes}
  3682. if complib:
  3683. if complevel is None:
  3684. complevel = self._complevel or 9
  3685. filters = _tables().Filters(
  3686. complevel=complevel,
  3687. complib=complib,
  3688. fletcher32=fletcher32 or self._fletcher32,
  3689. )
  3690. d["filters"] = filters
  3691. elif self._filters is not None:
  3692. d["filters"] = self._filters
  3693. return d
  3694. def read_coordinates(
  3695. self, where=None, start: int | None = None, stop: int | None = None
  3696. ):
  3697. """
  3698. select coordinates (row numbers) from a table; return the
  3699. coordinates object
  3700. """
  3701. # validate the version
  3702. self.validate_version(where)
  3703. # infer the data kind
  3704. if not self.infer_axes():
  3705. return False
  3706. # create the selection
  3707. selection = Selection(self, where=where, start=start, stop=stop)
  3708. coords = selection.select_coords()
  3709. if selection.filter is not None:
  3710. for field, op, filt in selection.filter.format():
  3711. data = self.read_column(
  3712. field, start=coords.min(), stop=coords.max() + 1
  3713. )
  3714. coords = coords[op(data.iloc[coords - coords.min()], filt).values]
  3715. return Index(coords)
  3716. def read_column(
  3717. self,
  3718. column: str,
  3719. where=None,
  3720. start: int | None = None,
  3721. stop: int | None = None,
  3722. ):
  3723. """
  3724. return a single column from the table, generally only indexables
  3725. are interesting
  3726. """
  3727. # validate the version
  3728. self.validate_version()
  3729. # infer the data kind
  3730. if not self.infer_axes():
  3731. return False
  3732. if where is not None:
  3733. raise TypeError("read_column does not currently accept a where clause")
  3734. # find the axes
  3735. for a in self.axes:
  3736. if column == a.name:
  3737. if not a.is_data_indexable:
  3738. raise ValueError(
  3739. f"column [{column}] can not be extracted individually; "
  3740. "it is not data indexable"
  3741. )
  3742. # column must be an indexable or a data column
  3743. c = getattr(self.table.cols, column)
  3744. a.set_info(self.info)
  3745. col_values = a.convert(
  3746. c[start:stop],
  3747. nan_rep=self.nan_rep,
  3748. encoding=self.encoding,
  3749. errors=self.errors,
  3750. )
  3751. cvs = _set_tz(col_values[1], a.tz)
  3752. dtype = getattr(self.table.attrs, f"{column}_meta", None)
  3753. return Series(cvs, name=column, copy=False, dtype=dtype)
  3754. raise KeyError(f"column [{column}] not found in the table")
class WORMTable(Table):
    """
    a write-once read-many table: this format DOES NOT ALLOW appending to a
    table. writing is a one-time operation the data are stored in a format
    that allows for searching the data on disk

    Both ``read`` and ``write`` are unimplemented here and raise
    NotImplementedError.
    """

    table_type = "worm"

    def read(
        self,
        where=None,
        columns=None,
        start: int | None = None,
        stop: int | None = None,
    ):
        """
        read the indices and the indexing array, calculate offset rows and return

        Not implemented: always raises NotImplementedError.
        """
        raise NotImplementedError("WORMTable needs to implement read")

    def write(self, obj, **kwargs) -> None:
        """
        write in a format that we can search later on (but cannot append
        to): write out the indices and the values using _write_array
        (e.g. a CArray) create an indexing table so that we can search

        Not implemented: always raises NotImplementedError.
        """
        raise NotImplementedError("WORMTable needs to implement write")
  3780. class AppendableTable(Table):
  3781. """support the new appendable table formats"""
  3782. table_type = "appendable"
  3783. # error: Signature of "write" incompatible with supertype "Fixed"
    def write(  # type: ignore[override]
        self,
        obj,
        axes=None,
        append: bool = False,
        complib=None,
        complevel=None,
        fletcher32=None,
        min_itemsize=None,
        chunksize: int | None = None,
        expectedrows=None,
        dropna: bool = False,
        nan_rep=None,
        data_columns=None,
        track_times: bool = True,
    ) -> None:
        """
        Write ``obj`` to the table, creating the table node if needed.

        When ``append`` is False an existing "table" node is removed first.
        ``axes``, ``min_itemsize``, ``nan_rep`` and ``data_columns`` are
        forwarded to ``_create_axes``; ``complib``, ``complevel``,
        ``fletcher32`` and ``expectedrows`` to ``create_description``;
        ``chunksize`` and ``dropna`` to ``write_data``. ``track_times`` is
        passed on to the table creation call.
        """
        if not append and self.is_exists:
            self._handle.remove_node(self.group, "table")

        # create the axes
        # (validation against an existing table only happens when appending)
        table = self._create_axes(
            axes=axes,
            obj=obj,
            validate=append,
            min_itemsize=min_itemsize,
            nan_rep=nan_rep,
            data_columns=data_columns,
        )

        for a in table.axes:
            a.validate_names()

        if not table.is_exists:
            # create the table
            options = table.create_description(
                complib=complib,
                complevel=complevel,
                fletcher32=fletcher32,
                expectedrows=expectedrows,
            )

            # set the table attributes
            table.set_attrs()

            options["track_times"] = track_times

            # create the table
            table._handle.create_table(table.group, **options)

        # update my info
        table.attrs.info = table.info

        # validate the axes and set the kinds
        for a in table.axes:
            a.validate_and_set(table, append)

        # add the rows
        table.write_data(chunksize, dropna=dropna)
  3833. def write_data(self, chunksize: int | None, dropna: bool = False) -> None:
  3834. """
  3835. we form the data into a 2-d including indexes,values,mask write chunk-by-chunk
  3836. """
  3837. names = self.dtype.names
  3838. nrows = self.nrows_expected
  3839. # if dropna==True, then drop ALL nan rows
  3840. masks = []
  3841. if dropna:
  3842. for a in self.values_axes:
  3843. # figure the mask: only do if we can successfully process this
  3844. # column, otherwise ignore the mask
  3845. mask = isna(a.data).all(axis=0)
  3846. if isinstance(mask, np.ndarray):
  3847. masks.append(mask.astype("u1", copy=False))
  3848. # consolidate masks
  3849. if len(masks):
  3850. mask = masks[0]
  3851. for m in masks[1:]:
  3852. mask = mask & m
  3853. mask = mask.ravel()
  3854. else:
  3855. mask = None
  3856. # broadcast the indexes if needed
  3857. indexes = [a.cvalues for a in self.index_axes]
  3858. nindexes = len(indexes)
  3859. assert nindexes == 1, nindexes # ensures we dont need to broadcast
  3860. # transpose the values so first dimension is last
  3861. # reshape the values if needed
  3862. values = [a.take_data() for a in self.values_axes]
  3863. values = [v.transpose(np.roll(np.arange(v.ndim), v.ndim - 1)) for v in values]
  3864. bvalues = []
  3865. for i, v in enumerate(values):
  3866. new_shape = (nrows,) + self.dtype[names[nindexes + i]].shape
  3867. bvalues.append(v.reshape(new_shape))
  3868. # write the chunks
  3869. if chunksize is None:
  3870. chunksize = 100000
  3871. rows = np.empty(min(chunksize, nrows), dtype=self.dtype)
  3872. chunks = nrows // chunksize + 1
  3873. for i in range(chunks):
  3874. start_i = i * chunksize
  3875. end_i = min((i + 1) * chunksize, nrows)
  3876. if start_i >= end_i:
  3877. break
  3878. self.write_data_chunk(
  3879. rows,
  3880. indexes=[a[start_i:end_i] for a in indexes],
  3881. mask=mask[start_i:end_i] if mask is not None else None,
  3882. values=[v[start_i:end_i] for v in bvalues],
  3883. )
  3884. def write_data_chunk(
  3885. self,
  3886. rows: np.ndarray,
  3887. indexes: list[np.ndarray],
  3888. mask: npt.NDArray[np.bool_] | None,
  3889. values: list[np.ndarray],
  3890. ) -> None:
  3891. """
  3892. Parameters
  3893. ----------
  3894. rows : an empty memory space where we are putting the chunk
  3895. indexes : an array of the indexes
  3896. mask : an array of the masks
  3897. values : an array of the values
  3898. """
  3899. # 0 len
  3900. for v in values:
  3901. if not np.prod(v.shape):
  3902. return
  3903. nrows = indexes[0].shape[0]
  3904. if nrows != len(rows):
  3905. rows = np.empty(nrows, dtype=self.dtype)
  3906. names = self.dtype.names
  3907. nindexes = len(indexes)
  3908. # indexes
  3909. for i, idx in enumerate(indexes):
  3910. rows[names[i]] = idx
  3911. # values
  3912. for i, v in enumerate(values):
  3913. rows[names[i + nindexes]] = v
  3914. # mask
  3915. if mask is not None:
  3916. m = ~mask.ravel().astype(bool, copy=False)
  3917. if not m.all():
  3918. rows = rows[m]
  3919. if len(rows):
  3920. self.table.append(rows)
  3921. self.table.flush()
  3922. def delete(self, where=None, start: int | None = None, stop: int | None = None):
  3923. # delete all rows (and return the nrows)
  3924. if where is None or not len(where):
  3925. if start is None and stop is None:
  3926. nrows = self.nrows
  3927. self._handle.remove_node(self.group, recursive=True)
  3928. else:
  3929. # pytables<3.0 would remove a single row with stop=None
  3930. if stop is None:
  3931. stop = self.nrows
  3932. nrows = self.table.remove_rows(start=start, stop=stop)
  3933. self.table.flush()
  3934. return nrows
  3935. # infer the data kind
  3936. if not self.infer_axes():
  3937. return None
  3938. # create the selection
  3939. table = self.table
  3940. selection = Selection(self, where, start=start, stop=stop)
  3941. values = selection.select_coords()
  3942. # delete the rows in reverse order
  3943. sorted_series = Series(values, copy=False).sort_values()
  3944. ln = len(sorted_series)
  3945. if ln:
  3946. # construct groups of consecutive rows
  3947. diff = sorted_series.diff()
  3948. groups = list(diff[diff > 1].index)
  3949. # 1 group
  3950. if not len(groups):
  3951. groups = [0]
  3952. # final element
  3953. if groups[-1] != ln:
  3954. groups.append(ln)
  3955. # initial element
  3956. if groups[0] != 0:
  3957. groups.insert(0, 0)
  3958. # we must remove in reverse order!
  3959. pg = groups.pop()
  3960. for g in reversed(groups):
  3961. rows = sorted_series.take(range(g, pg))
  3962. table.remove_rows(
  3963. start=rows[rows.index[0]], stop=rows[rows.index[-1]] + 1
  3964. )
  3965. pg = g
  3966. self.table.flush()
  3967. # return the number of rows removed
  3968. return ln
class AppendableFrameTable(AppendableTable):
    """support the new appendable table formats"""

    pandas_kind = "frame_table"
    table_type = "appendable_frame"
    ndim = 2
    obj_type: type[DataFrame | Series] = DataFrame

    @property
    def is_transposed(self) -> bool:
        """True when the first index axis has ``axis == 1``."""
        return self.index_axes[0].axis == 1

    @classmethod
    def get_object(cls, obj, transposed: bool):
        """these are written transposed"""
        if transposed:
            obj = obj.T
        return obj

    def read(
        self,
        where=None,
        columns=None,
        start: int | None = None,
        stop: int | None = None,
    ):
        """
        Read the stored axes and assemble them into a DataFrame.

        One frame is built per values axis; several frames are concatenated
        along axis 1, and the result is passed through ``process_axes``
        together with a ``Selection`` built from ``where``/``start``/``stop``.

        Parameters
        ----------
        where : selection criteria, optional
        columns : passed to ``process_axes``, optional
        start, stop : int or None

        Returns
        -------
        The result of ``process_axes``, or None when ``infer_axes()`` is
        falsy.
        """
        # validate the version
        self.validate_version(where)

        # infer the data kind
        if not self.infer_axes():
            return None

        result = self._read_axes(where=where, start=start, stop=stop)

        # info recorded for the first non-index axis (empty when there is none)
        info = (
            self.info.get(self.non_index_axes[0][0], {})
            if len(self.non_index_axes)
            else {}
        )

        # locate the (single) index axis among all axes
        inds = [i for i, ax in enumerate(self.axes) if ax is self.index_axes[0]]
        assert len(inds) == 1
        ind = inds[0]

        index = result[ind][0]

        frames = []
        for i, a in enumerate(self.axes):
            if a not in self.values_axes:
                continue
            index_vals, cvalues = result[i]

            # we could have a multi-index constructor here
            # ensure_index doesn't recognized our list-of-tuples here
            if info.get("type") != "MultiIndex":
                cols = Index(index_vals)
            else:
                cols = MultiIndex.from_tuples(index_vals)

            names = info.get("names")
            if names is not None:
                cols.set_names(names, inplace=True)

            # which of (index, cols) labels the rows depends on is_transposed
            if self.is_transposed:
                values = cvalues
                index_ = cols
                cols_ = Index(index, name=getattr(index, "name", None))
            else:
                values = cvalues.T
                index_ = Index(index, name=getattr(index, "name", None))
                cols_ = cols

            # if we have a DataIndexableCol, its shape will only be 1 dim
            if values.ndim == 1 and isinstance(values, np.ndarray):
                values = values.reshape((1, values.shape[0]))

            if isinstance(values, np.ndarray):
                try:
                    df = DataFrame(values.T, columns=cols_, index=index_, copy=False)
                except UnicodeEncodeError as err:
                    # retry with an explicit python-storage StringDtype, but
                    # only for the surrogate error under these settings;
                    # anything else is re-raised
                    if (
                        self.errors == "surrogatepass"
                        and get_option("future.infer_string")
                        and str(err).endswith("surrogates not allowed")
                        and HAS_PYARROW
                    ):
                        df = DataFrame(
                            values.T,
                            columns=cols_,
                            index=index_,
                            copy=False,
                            dtype=StringDtype(storage="python", na_value=np.nan),
                        )
                    else:
                        raise
            elif isinstance(values, Index):
                df = DataFrame(values, columns=cols_, index=index_)
            else:
                # Categorical
                df = DataFrame._from_arrays([values], columns=cols_, index=index_)
            # the dtype check is skipped for object values when
            # using_string_dtype() is truthy
            if not (using_string_dtype() and values.dtype.kind == "O"):
                assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype)

            # If str / string dtype is stored in meta, use that.
            for column in cols_:
                dtype = getattr(self.table.attrs, f"{column}_meta", None)
                if dtype in ["str", "string"]:
                    df[column] = df[column].astype(dtype)
            frames.append(df)

        if len(frames) == 1:
            df = frames[0]
        else:
            df = concat(frames, axis=1)

        selection = Selection(self, where=where, start=start, stop=stop)
        # apply the selection filters & axis orderings
        df = self.process_axes(df, selection=selection, columns=columns)
        return df
  4071. class AppendableSeriesTable(AppendableFrameTable):
  4072. """support the new appendable table formats"""
  4073. pandas_kind = "series_table"
  4074. table_type = "appendable_series"
  4075. ndim = 2
  4076. obj_type = Series
  4077. @property
  4078. def is_transposed(self) -> bool:
  4079. return False
  4080. @classmethod
  4081. def get_object(cls, obj, transposed: bool):
  4082. return obj
  4083. # error: Signature of "write" incompatible with supertype "Fixed"
  4084. def write(self, obj, data_columns=None, **kwargs) -> None: # type: ignore[override]
  4085. """we are going to write this as a frame table"""
  4086. if not isinstance(obj, DataFrame):
  4087. name = obj.name or "values"
  4088. obj = obj.to_frame(name)
  4089. super().write(obj=obj, data_columns=obj.columns.tolist(), **kwargs)
  4090. def read(
  4091. self,
  4092. where=None,
  4093. columns=None,
  4094. start: int | None = None,
  4095. stop: int | None = None,
  4096. ) -> Series:
  4097. is_multi_index = self.is_multi_index
  4098. if columns is not None and is_multi_index:
  4099. assert isinstance(self.levels, list) # needed for mypy
  4100. for n in self.levels:
  4101. if n not in columns:
  4102. columns.insert(0, n)
  4103. s = super().read(where=where, columns=columns, start=start, stop=stop)
  4104. if is_multi_index:
  4105. s.set_index(self.levels, inplace=True)
  4106. s = s.iloc[:, 0]
  4107. # remove the default name
  4108. if s.name == "values":
  4109. s.name = None
  4110. return s
  4111. class AppendableMultiSeriesTable(AppendableSeriesTable):
  4112. """support the new appendable table formats"""
  4113. pandas_kind = "series_table"
  4114. table_type = "appendable_multiseries"
  4115. # error: Signature of "write" incompatible with supertype "Fixed"
  4116. def write(self, obj, **kwargs) -> None: # type: ignore[override]
  4117. """we are going to write this as a frame table"""
  4118. name = obj.name or "values"
  4119. newobj, self.levels = self.validate_multiindex(obj)
  4120. assert isinstance(self.levels, list) # for mypy
  4121. cols = list(self.levels)
  4122. cols.append(name)
  4123. newobj.columns = Index(cols)
  4124. super().write(obj=newobj, **kwargs)
  4125. class GenericTable(AppendableFrameTable):
  4126. """a table that read/writes the generic pytables table format"""
  4127. pandas_kind = "frame_table"
  4128. table_type = "generic_table"
  4129. ndim = 2
  4130. obj_type = DataFrame
  4131. levels: list[Hashable]
  4132. @property
  4133. def pandas_type(self) -> str:
  4134. return self.pandas_kind
  4135. @property
  4136. def storable(self):
  4137. return getattr(self.group, "table", None) or self.group
  4138. def get_attrs(self) -> None:
  4139. """retrieve our attributes"""
  4140. self.non_index_axes = []
  4141. self.nan_rep = None
  4142. self.levels = []
  4143. self.index_axes = [a for a in self.indexables if a.is_an_indexable]
  4144. self.values_axes = [a for a in self.indexables if not a.is_an_indexable]
  4145. self.data_columns = [a.name for a in self.values_axes]
  4146. @cache_readonly
  4147. def indexables(self):
  4148. """create the indexables from the table description"""
  4149. d = self.description
  4150. # TODO: can we get a typ for this? AFAICT it is the only place
  4151. # where we aren't passing one
  4152. # the index columns is just a simple index
  4153. md = self.read_metadata("index")
  4154. meta = "category" if md is not None else None
  4155. index_col = GenericIndexCol(
  4156. name="index", axis=0, table=self.table, meta=meta, metadata=md
  4157. )
  4158. _indexables: list[GenericIndexCol | GenericDataIndexableCol] = [index_col]
  4159. for i, n in enumerate(d._v_names):
  4160. assert isinstance(n, str)
  4161. atom = getattr(d, n)
  4162. md = self.read_metadata(n)
  4163. meta = "category" if md is not None else None
  4164. dc = GenericDataIndexableCol(
  4165. name=n,
  4166. pos=i,
  4167. values=[n],
  4168. typ=atom,
  4169. table=self.table,
  4170. meta=meta,
  4171. metadata=md,
  4172. )
  4173. _indexables.append(dc)
  4174. return _indexables
  4175. # error: Signature of "write" incompatible with supertype "AppendableTable"
  4176. def write(self, **kwargs) -> None: # type: ignore[override]
  4177. raise NotImplementedError("cannot write on an generic table")
  4178. class AppendableMultiFrameTable(AppendableFrameTable):
  4179. """a frame with a multi-index"""
  4180. table_type = "appendable_multiframe"
  4181. obj_type = DataFrame
  4182. ndim = 2
  4183. _re_levels = re.compile(r"^level_\d+$")
  4184. @property
  4185. def table_type_short(self) -> str:
  4186. return "appendable_multi"
  4187. # error: Signature of "write" incompatible with supertype "Fixed"
  4188. def write(self, obj, data_columns=None, **kwargs) -> None: # type: ignore[override]
  4189. if data_columns is None:
  4190. data_columns = []
  4191. elif data_columns is True:
  4192. data_columns = obj.columns.tolist()
  4193. obj, self.levels = self.validate_multiindex(obj)
  4194. assert isinstance(self.levels, list) # for mypy
  4195. for n in self.levels:
  4196. if n not in data_columns:
  4197. data_columns.insert(0, n)
  4198. super().write(obj=obj, data_columns=data_columns, **kwargs)
  4199. def read(
  4200. self,
  4201. where=None,
  4202. columns=None,
  4203. start: int | None = None,
  4204. stop: int | None = None,
  4205. ):
  4206. df = super().read(where=where, columns=columns, start=start, stop=stop)
  4207. df = df.set_index(self.levels)
  4208. # remove names for 'level_%d'
  4209. df.index = df.index.set_names(
  4210. [None if self._re_levels.search(name) else name for name in df.index.names]
  4211. )
  4212. return df
  4213. def _reindex_axis(
  4214. obj: DataFrame, axis: AxisInt, labels: Index, other=None
  4215. ) -> DataFrame:
  4216. ax = obj._get_axis(axis)
  4217. labels = ensure_index(labels)
  4218. # try not to reindex even if other is provided
  4219. # if it equals our current index
  4220. if other is not None:
  4221. other = ensure_index(other)
  4222. if (other is None or labels.equals(other)) and labels.equals(ax):
  4223. return obj
  4224. labels = ensure_index(labels.unique())
  4225. if other is not None:
  4226. labels = ensure_index(other.unique()).intersection(labels, sort=False)
  4227. if not labels.equals(ax):
  4228. slicer: list[slice | Index] = [slice(None, None)] * obj.ndim
  4229. slicer[axis] = labels
  4230. obj = obj.loc[tuple(slicer)]
  4231. return obj
  4232. # tz to/from coercion
  4233. def _get_tz(tz: tzinfo) -> str | tzinfo:
  4234. """for a tz-aware type, return an encoded zone"""
  4235. zone = timezones.get_timezone(tz)
  4236. return zone
  4237. @overload
  4238. def _set_tz(
  4239. values: np.ndarray | Index, tz: str | tzinfo, coerce: bool = False
  4240. ) -> DatetimeIndex:
  4241. ...
  4242. @overload
  4243. def _set_tz(values: np.ndarray | Index, tz: None, coerce: bool = False) -> np.ndarray:
  4244. ...
  4245. def _set_tz(
  4246. values: np.ndarray | Index, tz: str | tzinfo | None, coerce: bool = False
  4247. ) -> np.ndarray | DatetimeIndex:
  4248. """
  4249. coerce the values to a DatetimeIndex if tz is set
  4250. preserve the input shape if possible
  4251. Parameters
  4252. ----------
  4253. values : ndarray or Index
  4254. tz : str or tzinfo
  4255. coerce : if we do not have a passed timezone, coerce to M8[ns] ndarray
  4256. """
  4257. if isinstance(values, DatetimeIndex):
  4258. # If values is tzaware, the tz gets dropped in the values.ravel()
  4259. # call below (which returns an ndarray). So we are only non-lossy
  4260. # if `tz` matches `values.tz`.
  4261. assert values.tz is None or values.tz == tz
  4262. if values.tz is not None:
  4263. return values
  4264. if tz is not None:
  4265. if isinstance(values, DatetimeIndex):
  4266. name = values.name
  4267. else:
  4268. name = None
  4269. values = values.ravel()
  4270. tz = _ensure_decoded(tz)
  4271. values = DatetimeIndex(values, name=name)
  4272. values = values.tz_localize("UTC").tz_convert(tz)
  4273. elif coerce:
  4274. values = np.asarray(values, dtype="M8[ns]")
  4275. # error: Incompatible return value type (got "Union[ndarray, Index]",
  4276. # expected "Union[ndarray, DatetimeIndex]")
  4277. return values # type: ignore[return-value]
  4278. def _convert_index(name: str, index: Index, encoding: str, errors: str) -> IndexCol:
  4279. assert isinstance(name, str)
  4280. index_name = index.name
  4281. # error: Argument 1 to "_get_data_and_dtype_name" has incompatible type "Index";
  4282. # expected "Union[ExtensionArray, ndarray]"
  4283. converted, dtype_name = _get_data_and_dtype_name(index) # type: ignore[arg-type]
  4284. kind = _dtype_to_kind(dtype_name)
  4285. atom = DataIndexableCol._get_atom(converted)
  4286. if (
  4287. lib.is_np_dtype(index.dtype, "iu")
  4288. or needs_i8_conversion(index.dtype)
  4289. or is_bool_dtype(index.dtype)
  4290. ):
  4291. # Includes Index, RangeIndex, DatetimeIndex, TimedeltaIndex, PeriodIndex,
  4292. # in which case "kind" is "integer", "integer", "datetime64",
  4293. # "timedelta64", and "integer", respectively.
  4294. return IndexCol(
  4295. name,
  4296. values=converted,
  4297. kind=kind,
  4298. typ=atom,
  4299. freq=getattr(index, "freq", None),
  4300. tz=getattr(index, "tz", None),
  4301. index_name=index_name,
  4302. )
  4303. if isinstance(index, MultiIndex):
  4304. raise TypeError("MultiIndex not supported here!")
  4305. inferred_type = lib.infer_dtype(index, skipna=False)
  4306. # we won't get inferred_type of "datetime64" or "timedelta64" as these
  4307. # would go through the DatetimeIndex/TimedeltaIndex paths above
  4308. values = np.asarray(index)
  4309. if inferred_type == "date":
  4310. converted = np.asarray([v.toordinal() for v in values], dtype=np.int32)
  4311. return IndexCol(
  4312. name, converted, "date", _tables().Time32Col(), index_name=index_name
  4313. )
  4314. elif inferred_type == "string":
  4315. converted = _convert_string_array(values, encoding, errors)
  4316. itemsize = converted.dtype.itemsize
  4317. return IndexCol(
  4318. name,
  4319. converted,
  4320. "string",
  4321. _tables().StringCol(itemsize),
  4322. index_name=index_name,
  4323. )
  4324. elif inferred_type in ["integer", "floating"]:
  4325. return IndexCol(
  4326. name, values=converted, kind=kind, typ=atom, index_name=index_name
  4327. )
  4328. else:
  4329. assert isinstance(converted, np.ndarray) and converted.dtype == object
  4330. assert kind == "object", kind
  4331. atom = _tables().ObjectAtom()
  4332. return IndexCol(name, converted, kind, atom, index_name=index_name)
  4333. def _unconvert_index(data, kind: str, encoding: str, errors: str) -> np.ndarray | Index:
  4334. index: Index | np.ndarray
  4335. if kind.startswith("datetime64"):
  4336. if kind == "datetime64":
  4337. # created before we stored resolution information
  4338. index = DatetimeIndex(data)
  4339. else:
  4340. index = DatetimeIndex(data.view(kind))
  4341. elif kind == "timedelta64":
  4342. index = TimedeltaIndex(data)
  4343. elif kind == "date":
  4344. try:
  4345. index = np.asarray([date.fromordinal(v) for v in data], dtype=object)
  4346. except ValueError:
  4347. index = np.asarray([date.fromtimestamp(v) for v in data], dtype=object)
  4348. elif kind in ("integer", "float", "bool"):
  4349. index = np.asarray(data)
  4350. elif kind in ("string"):
  4351. index = _unconvert_string_array(
  4352. data, nan_rep=None, encoding=encoding, errors=errors
  4353. )
  4354. elif kind == "object":
  4355. index = np.asarray(data[0])
  4356. else: # pragma: no cover
  4357. raise ValueError(f"unrecognized index type {kind}")
  4358. return index
def _maybe_convert_for_string_atom(
    name: str,
    bvalues: ArrayLike,
    existing_col,
    min_itemsize,
    nan_rep,
    encoding,
    errors,
    columns: list[str],
):
    """
    Convert object-dtype string values to a fixed-width bytes array.

    Values whose dtype is not object (after converting a ``StringDtype``
    array with ``to_numpy``) are returned unchanged.

    Parameters
    ----------
    name : str
        Key looked up in ``min_itemsize`` when that is a dict.
    bvalues : ArrayLike
    existing_col : existing column or None
        When given, its ``itemsize`` and ``validate_col`` constrain the
        resulting item size.
    min_itemsize : int, dict or None
        For a dict, the entry for ``name`` is used, then the "values" entry.
    nan_rep : value written in place of missing entries
    encoding, errors : passed to ``_convert_string_array``
    columns : list of str
        Labels used in the error message for a non-string column.

    Raises
    ------
    TypeError
        For date or datetime data, or when a column holds non-string objects.
    ValueError
        If ``nan_rep`` is longer than the existing column's item size and
        there are missing values.
    """
    if isinstance(bvalues.dtype, StringDtype):
        # "ndarray[Any, Any]" has no attribute "to_numpy"
        bvalues = bvalues.to_numpy()  # type: ignore[union-attr]
    if bvalues.dtype != object:
        return bvalues

    bvalues = cast(np.ndarray, bvalues)

    dtype_name = bvalues.dtype.name
    inferred_type = lib.infer_dtype(bvalues, skipna=False)

    if inferred_type == "date":
        raise TypeError("[date] is not implemented as a table column")
    if inferred_type == "datetime":
        # after GH#8260
        # this only would be hit for a multi-timezone dtype which is an error
        raise TypeError(
            "too many timezones in this block, create separate data columns"
        )

    # NOTE(review): the dtype equals object at this point (see the early
    # return above), so dtype_name is presumably always "object" and this
    # return looks unreachable - confirm.
    if not (inferred_type == "string" or dtype_name == "object"):
        return bvalues

    # replace missing entries by nan_rep on a copy of the data
    mask = isna(bvalues)
    data = bvalues.copy()
    data[mask] = nan_rep

    if existing_col and mask.any() and len(nan_rep) > existing_col.itemsize:
        raise ValueError("NaN representation is too large for existing column size")

    # see if we have a valid string type
    inferred_type = lib.infer_dtype(data, skipna=False)
    if inferred_type != "string":
        # we cannot serialize this data, so report an exception on a column
        # by column basis

        # expected behaviour:
        # search block for a non-string object column by column
        for i in range(data.shape[0]):
            col = data[i]
            inferred_type = lib.infer_dtype(col, skipna=False)
            if inferred_type != "string":
                # fall back to a positional label when no column label exists
                error_column_label = columns[i] if len(columns) > i else f"No.{i}"
                raise TypeError(
                    f"Cannot serialize the column [{error_column_label}]\n"
                    f"because its data contents are not [string] but "
                    f"[{inferred_type}] object dtype"
                )

    # itemsize is the maximum length of a string (along any dimension)

    data_converted = _convert_string_array(data, encoding, errors).reshape(data.shape)
    itemsize = data_converted.itemsize

    # specified min_itemsize?
    if isinstance(min_itemsize, dict):
        min_itemsize = int(min_itemsize.get(name) or min_itemsize.get("values") or 0)
    itemsize = max(min_itemsize or 0, itemsize)

    # check for column in the values conflicts
    if existing_col is not None:
        eci = existing_col.validate_col(itemsize)
        if eci is not None and eci > itemsize:
            itemsize = eci

    # widen (or keep) the fixed-width bytes dtype to the final item size
    data_converted = data_converted.astype(f"|S{itemsize}", copy=False)
    return data_converted
  4423. def _convert_string_array(data: np.ndarray, encoding: str, errors: str) -> np.ndarray:
  4424. """
  4425. Take a string-like that is object dtype and coerce to a fixed size string type.
  4426. Parameters
  4427. ----------
  4428. data : np.ndarray[object]
  4429. encoding : str
  4430. errors : str
  4431. Handler for encoding errors.
  4432. Returns
  4433. -------
  4434. np.ndarray[fixed-length-string]
  4435. """
  4436. # encode if needed
  4437. if len(data):
  4438. data = (
  4439. Series(data.ravel(), copy=False, dtype="object")
  4440. .str.encode(encoding, errors)
  4441. ._values.reshape(data.shape)
  4442. )
  4443. # create the sized dtype
  4444. ensured = ensure_object(data.ravel())
  4445. itemsize = max(1, libwriters.max_len_string_array(ensured))
  4446. data = np.asarray(data, dtype=f"S{itemsize}")
  4447. return data
  4448. def _unconvert_string_array(
  4449. data: np.ndarray, nan_rep, encoding: str, errors: str
  4450. ) -> np.ndarray:
  4451. """
  4452. Inverse of _convert_string_array.
  4453. Parameters
  4454. ----------
  4455. data : np.ndarray[fixed-length-string]
  4456. nan_rep : the storage repr of NaN
  4457. encoding : str
  4458. errors : str
  4459. Handler for encoding errors.
  4460. Returns
  4461. -------
  4462. np.ndarray[object]
  4463. Decoded data.
  4464. """
  4465. shape = data.shape
  4466. data = np.asarray(data.ravel(), dtype=object)
  4467. if len(data):
  4468. itemsize = libwriters.max_len_string_array(ensure_object(data))
  4469. dtype = f"U{itemsize}"
  4470. if isinstance(data[0], bytes):
  4471. ser = Series(data, copy=False).str.decode(
  4472. encoding, errors=errors, dtype="object"
  4473. )
  4474. data = ser.to_numpy()
  4475. data.flags.writeable = True
  4476. else:
  4477. data = data.astype(dtype, copy=False).astype(object, copy=False)
  4478. if nan_rep is None:
  4479. nan_rep = "nan"
  4480. libwriters.string_array_replace_from_nan_rep(data, nan_rep)
  4481. return data.reshape(shape)
  4482. def _maybe_convert(values: np.ndarray, val_kind: str, encoding: str, errors: str):
  4483. assert isinstance(val_kind, str), type(val_kind)
  4484. if _need_convert(val_kind):
  4485. conv = _get_converter(val_kind, encoding, errors)
  4486. values = conv(values)
  4487. return values
  4488. def _get_converter(kind: str, encoding: str, errors: str):
  4489. if kind == "datetime64":
  4490. return lambda x: np.asarray(x, dtype="M8[ns]")
  4491. elif "datetime64" in kind:
  4492. return lambda x: np.asarray(x, dtype=kind)
  4493. elif kind == "string":
  4494. return lambda x: _unconvert_string_array(
  4495. x, nan_rep=None, encoding=encoding, errors=errors
  4496. )
  4497. else: # pragma: no cover
  4498. raise ValueError(f"invalid kind {kind}")
  4499. def _need_convert(kind: str) -> bool:
  4500. if kind in ("datetime64", "string") or "datetime64" in kind:
  4501. return True
  4502. return False
  4503. def _maybe_adjust_name(name: str, version: Sequence[int]) -> str:
  4504. """
  4505. Prior to 0.10.1, we named values blocks like: values_block_0 an the
  4506. name values_0, adjust the given name if necessary.
  4507. Parameters
  4508. ----------
  4509. name : str
  4510. version : Tuple[int, int, int]
  4511. Returns
  4512. -------
  4513. str
  4514. """
  4515. if isinstance(version, str) or len(version) < 3:
  4516. raise ValueError("Version is incorrect, expected sequence of 3 integers.")
  4517. if version[0] == 0 and version[1] <= 10 and version[2] == 0:
  4518. m = re.search(r"values_block_(\d+)", name)
  4519. if m:
  4520. grp = m.groups()[0]
  4521. name = f"values_{grp}"
  4522. return name
  4523. def _dtype_to_kind(dtype_str: str) -> str:
  4524. """
  4525. Find the "kind" string describing the given dtype name.
  4526. """
  4527. dtype_str = _ensure_decoded(dtype_str)
  4528. if dtype_str.startswith(("string", "bytes")):
  4529. kind = "string"
  4530. elif dtype_str.startswith("float"):
  4531. kind = "float"
  4532. elif dtype_str.startswith("complex"):
  4533. kind = "complex"
  4534. elif dtype_str.startswith(("int", "uint")):
  4535. kind = "integer"
  4536. elif dtype_str.startswith("datetime64"):
  4537. kind = dtype_str
  4538. elif dtype_str.startswith("timedelta"):
  4539. kind = "timedelta64"
  4540. elif dtype_str.startswith("bool"):
  4541. kind = "bool"
  4542. elif dtype_str.startswith("category"):
  4543. kind = "category"
  4544. elif dtype_str.startswith("period"):
  4545. # We store the `freq` attr so we can restore from integers
  4546. kind = "integer"
  4547. elif dtype_str == "object":
  4548. kind = "object"
  4549. elif dtype_str == "str":
  4550. kind = "str"
  4551. else:
  4552. raise ValueError(f"cannot interpret dtype of [{dtype_str}]")
  4553. return kind
  4554. def _get_data_and_dtype_name(data: ArrayLike):
  4555. """
  4556. Convert the passed data into a storable form and a dtype string.
  4557. """
  4558. if isinstance(data, Categorical):
  4559. data = data.codes
  4560. if isinstance(data.dtype, DatetimeTZDtype):
  4561. # For datetime64tz we need to drop the TZ in tests TODO: why?
  4562. dtype_name = f"datetime64[{data.dtype.unit}]"
  4563. else:
  4564. dtype_name = data.dtype.name
  4565. if data.dtype.kind in "mM":
  4566. data = np.asarray(data.view("i8"))
  4567. # TODO: we used to reshape for the dt64tz case, but no longer
  4568. # doing that doesn't seem to break anything. why?
  4569. elif isinstance(data, PeriodIndex):
  4570. data = data.asi8
  4571. data = np.asarray(data)
  4572. return data, dtype_name
class Selection:
    """
    Carries out a selection operation on a tables.Table object.

    Parameters
    ----------
    table : a Table object
    where : list of Terms (or convertible to)
    start, stop: indices to start and/or stop selection
    """

    def __init__(
        self,
        table: Table,
        where=None,
        start: int | None = None,
        stop: int | None = None,
    ) -> None:
        """
        Interpret ``where`` either as row coordinates or as an expression.

        A list-like of integers or booleans becomes ``self.coordinates``;
        anything else is turned into terms by ``generate`` and evaluated
        into ``self.condition`` and ``self.filter``.
        """
        self.table = table
        self.where = where
        self.start = start
        self.stop = stop
        self.condition = None
        self.filter = None
        self.terms = None
        self.coordinates = None

        if is_list_like(where):
            # see if we have a passed coordinate like
            with suppress(ValueError):
                inferred = lib.infer_dtype(where, skipna=False)
                if inferred in ("integer", "boolean"):
                    where = np.asarray(where)
                    if where.dtype == np.bool_:
                        # a boolean mask is applied to the range start..stop
                        start, stop = self.start, self.stop
                        if start is None:
                            start = 0
                        if stop is None:
                            stop = self.table.nrows
                        self.coordinates = np.arange(start, stop)[where]
                    elif issubclass(where.dtype.type, np.integer):
                        # NOTE(review): this raise sits inside the
                        # suppress(ValueError) block, so the error does not
                        # propagate; coordinates stay None and the array is
                        # handed to generate() below - confirm this is
                        # intended.
                        if (self.start is not None and (where < self.start).any()) or (
                            self.stop is not None and (where >= self.stop).any()
                        ):
                            raise ValueError(
                                "where must have index locations >= start and < stop"
                            )
                        self.coordinates = where

        if self.coordinates is None:
            self.terms = self.generate(where)

            # create the numexpr & the filter
            if self.terms is not None:
                self.condition, self.filter = self.terms.evaluate()

    def generate(self, where):
        """
        where can be a : dict,list,tuple,string

        Returns None when ``where`` is None, otherwise a ``PyTablesExpr``
        built against the table's queryables.

        Raises
        ------
        ValueError
            When building the expression raises NameError; the message lists
            the currently defined references.
        """
        if where is None:
            return None

        q = self.table.queryables()
        try:
            return PyTablesExpr(where, queryables=q, encoding=self.table.encoding)
        except NameError as err:
            # raise a nice message, suggesting that the user should use
            # data_columns
            qkeys = ",".join(q.keys())
            # the message lines are kept flush-left inside the literal,
            # exactly as they appear in this file
            msg = dedent(
                f"""\
The passed where expression: {where}
contains an invalid variable reference
all of the variable references must be a reference to
an axis (e.g. 'index' or 'columns'), or a data_column
The currently defined references are: {qkeys}
"""
            )
            raise ValueError(msg) from err

    def select(self):
        """
        generate the selection

        Reads by condition when one was built, by coordinates when those
        were given, and otherwise reads the plain start..stop range.
        """
        if self.condition is not None:
            return self.table.table.read_where(
                self.condition.format(), start=self.start, stop=self.stop
            )
        elif self.coordinates is not None:
            return self.table.table.read_coordinates(self.coordinates)
        return self.table.table.read(start=self.start, stop=self.stop)

    def select_coords(self):
        """
        generate the selection

        Returns row coordinates rather than data. A missing start/stop
        defaults to 0/nrows and negative values are offset by nrows.
        """
        start, stop = self.start, self.stop
        nrows = self.table.nrows
        if start is None:
            start = 0
        elif start < 0:
            start += nrows
        if stop is None:
            stop = nrows
        elif stop < 0:
            stop += nrows

        if self.condition is not None:
            return self.table.table.get_where_list(
                self.condition.format(), start=start, stop=stop, sort=True
            )
        elif self.coordinates is not None:
            # coordinates are returned exactly as given (no sorting, and
            # start/stop are not applied here)
            return self.coordinates

        return np.arange(start, stop)