stata.py 133 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
732783279328032813282328332843285328632873288328932903291329232933294329532963297329832993300330133023303330433053306330733083309331033113312331333143315331633173318331933203321332233233324332533263327332833293330333133323333333433353336333733383339334033413342334333443345334633473348334933503351335233533354335533563357335833593360336133623363336433653366336733683369337033713372337333743375337633773378337933803381338233833384338533863387338833893390339133923393339433953396339733983399340034013402340334043405340634073408340934103411341234133414341534163417341834193420342134223423342434253426342734283429343034313432343334343435343634373438343934403441344234433444344534463447344834493450345134523453345434553456345734583459346034613462346334643465346634673468346934703471347234733474347534763477347834793480348134823483348434853486348734883489349034913492349334943495349634973498349935003501350235033504350535063507350835093510351135123513351435153516351735183519352035213522352335243525352635273528352935303531353235333534353535363537353835393540354135423543354435453546354735483549355035513552355335543555355635573558355935603561356235633564356535663567356835693570357135723573357435753576357735783579358035813582358335843585358635873588358935903591359235933594359535963597359835993600360136023603360436053606360736083609361036113612361336143615361636173618361936203621362236233624362536263627362836293630363136323633363436353636363736383639364036413642364336443645364636473648364936503651365236533654365536563657365836593660366136623663366436653666366736683669367036713672367336743675367636773678367936803681368236833684368536863687368836893690369136923693369436953696369736983699370037013702370337043705370637073708370937103711371237133714371537163717371837193720372137223723372437253726372737283729373037313732373337343735373637373738373937403741374237433744374537463747374837493750375137523753375437553756375737583759376037613762376337643765376637673768
  1. """
  2. Module contains tools for processing Stata files into DataFrames
  3. The StataReader below was originally written by Joe Presbrey as part of PyDTA.
  4. It has been extended and improved by Skipper Seabold from the Statsmodels
  5. project who also developed the StataWriter and was finally added to pandas in
  6. a once again improved version.
  7. You can find more information on http://presbrey.mit.edu/PyDTA and
  8. https://www.statsmodels.org/devel/
  9. """
  10. from __future__ import annotations
  11. from collections import abc
  12. from datetime import (
  13. datetime,
  14. timedelta,
  15. )
  16. from io import BytesIO
  17. import os
  18. import struct
  19. import sys
  20. from typing import (
  21. IO,
  22. TYPE_CHECKING,
  23. AnyStr,
  24. Callable,
  25. Final,
  26. cast,
  27. )
  28. import warnings
  29. import numpy as np
  30. from pandas._libs import lib
  31. from pandas._libs.lib import infer_dtype
  32. from pandas._libs.writers import max_len_string_array
  33. from pandas.errors import (
  34. CategoricalConversionWarning,
  35. InvalidColumnName,
  36. PossiblePrecisionLoss,
  37. ValueLabelTypeMismatch,
  38. )
  39. from pandas.util._decorators import (
  40. Appender,
  41. doc,
  42. )
  43. from pandas.util._exceptions import find_stack_level
  44. from pandas.core.dtypes.base import ExtensionDtype
  45. from pandas.core.dtypes.common import (
  46. ensure_object,
  47. is_numeric_dtype,
  48. is_string_dtype,
  49. )
  50. from pandas.core.dtypes.dtypes import CategoricalDtype
  51. from pandas import (
  52. Categorical,
  53. DatetimeIndex,
  54. NaT,
  55. Timestamp,
  56. isna,
  57. to_datetime,
  58. to_timedelta,
  59. )
  60. from pandas.core.frame import DataFrame
  61. from pandas.core.indexes.base import Index
  62. from pandas.core.indexes.range import RangeIndex
  63. from pandas.core.series import Series
  64. from pandas.core.shared_docs import _shared_docs
  65. from pandas.io.common import get_handle
  66. if TYPE_CHECKING:
  67. from collections.abc import (
  68. Hashable,
  69. Sequence,
  70. )
  71. from types import TracebackType
  72. from typing import Literal
  73. from pandas._typing import (
  74. CompressionOptions,
  75. FilePath,
  76. ReadBuffer,
  77. Self,
  78. StorageOptions,
  79. WriteBuffer,
  80. )
  81. _version_error = (
  82. "Version of given Stata file is {version}. pandas supports importing "
  83. "versions 105, 108, 111 (Stata 7SE), 113 (Stata 8/9), "
  84. "114 (Stata 10/11), 115 (Stata 12), 117 (Stata 13), 118 (Stata 14/15/16),"
  85. "and 119 (Stata 15/16, over 32,767 variables)."
  86. )
# Reusable numpydoc fragments. They are interpolated into the f-string
# docstring templates defined further down in this module, so a parameter is
# described in exactly one place.

# Switches controlling date and categorical conversion.
_statafile_processing_params1 = """\
convert_dates : bool, default True
    Convert date variables to DataFrame time values.
convert_categoricals : bool, default True
    Read value labels and convert columns to Categorical/Factor variables."""

# Remaining processing options: index selection, missing-value handling,
# dtype preservation, column subsetting and categorical ordering.
_statafile_processing_params2 = """\
index_col : str, optional
    Column to set as index.
convert_missing : bool, default False
    Flag indicating whether to convert missing values to their Stata
    representations. If False, missing values are replaced with nan.
    If True, columns containing missing values are returned with
    object data types and missing values are represented by
    StataMissingValue objects.
preserve_dtypes : bool, default True
    Preserve Stata datatypes. If False, numeric data are upcast to pandas
    default types for foreign data (float64 or int64).
columns : list or None
    Columns to retain. Columns will be returned in the given order. None
    returns all columns.
order_categoricals : bool, default True
    Flag indicating whether converted categorical data are ordered."""

# Description of the ``chunksize`` argument.
_chunksize_params = """\
chunksize : int, default None
    Return StataReader object for iterations, returns chunks with
    given number of lines."""

# Description of the ``iterator`` argument.
_iterator_params = """\
iterator : bool, default False
    Return StataReader object."""

# "Notes" section warning about categoricals produced by chunked reads.
_reader_notes = """\
Notes
-----
Categorical variables read through an iterator may not have the same
categories and dtype. This occurs when a variable stored in a DTA
file is associated to an incomplete set of value labels that only
label a strict subset of the values."""
  123. _read_stata_doc = f"""
  124. Read Stata file into DataFrame.
  125. Parameters
  126. ----------
  127. filepath_or_buffer : str, path object or file-like object
  128. Any valid string path is acceptable. The string could be a URL. Valid
  129. URL schemes include http, ftp, s3, and file. For file URLs, a host is
  130. expected. A local file could be: ``file://localhost/path/to/table.dta``.
  131. If you want to pass in a path object, pandas accepts any ``os.PathLike``.
  132. By file-like object, we refer to objects with a ``read()`` method,
  133. such as a file handle (e.g. via builtin ``open`` function)
  134. or ``StringIO``.
  135. {_statafile_processing_params1}
  136. {_statafile_processing_params2}
  137. {_chunksize_params}
  138. {_iterator_params}
  139. {_shared_docs["decompression_options"] % "filepath_or_buffer"}
  140. {_shared_docs["storage_options"]}
  141. Returns
  142. -------
  143. DataFrame or pandas.api.typing.StataReader
  144. See Also
  145. --------
  146. io.stata.StataReader : Low-level reader for Stata data files.
  147. DataFrame.to_stata: Export Stata data files.
  148. {_reader_notes}
  149. Examples
  150. --------
  151. Creating a dummy stata for this example
  152. >>> df = pd.DataFrame({{'animal': ['falcon', 'parrot', 'falcon', 'parrot'],
  153. ... 'speed': [350, 18, 361, 15]}}) # doctest: +SKIP
  154. >>> df.to_stata('animals.dta') # doctest: +SKIP
  155. Read a Stata dta file:
  156. >>> df = pd.read_stata('animals.dta') # doctest: +SKIP
  157. Read a Stata dta file in 10,000 line chunks:
  158. >>> values = np.random.randint(0, 10, size=(20_000, 1), dtype="uint8") # doctest: +SKIP
  159. >>> df = pd.DataFrame(values, columns=["i"]) # doctest: +SKIP
  160. >>> df.to_stata('filename.dta') # doctest: +SKIP
  161. >>> with pd.read_stata('filename.dta', chunksize=10000) as itr: # doctest: +SKIP
  162. >>> for chunk in itr:
  163. ... # Operate on a single chunk, e.g., chunk.mean()
  164. ... pass # doctest: +SKIP
  165. """
# Docstring template describing a read of observations into a DataFrame.
# Where it is attached is not visible in this part of the file.
_read_method_doc = f"""\
Reads observations from Stata file, converting them into a dataframe

Parameters
----------
nrows : int
    Number of lines to read from data file, if None read whole file.
{_statafile_processing_params1}
{_statafile_processing_params2}

Returns
-------
DataFrame
"""

# Docstring template for the Stata reader class.
# NOTE(review): ``_read_stata_doc`` interpolates
# ``_shared_docs["decompression_options"] % "filepath_or_buffer"``, whereas
# here the same entry is inserted without ``%`` formatting. If that entry
# contains a ``%s`` placeholder it will show up verbatim in this docstring --
# confirm whether ``% "path_or_buf"`` was intended.
_stata_reader_doc = f"""\
Class for reading Stata dta files.

Parameters
----------
path_or_buf : path (string), buffer or path object
    string, path object (pathlib.Path or py._path.local.LocalPath) or object
    implementing a binary read() functions.
{_statafile_processing_params1}
{_statafile_processing_params2}
{_chunksize_params}
{_shared_docs["decompression_options"]}
{_shared_docs["storage_options"]}

{_reader_notes}
"""
  192. _date_formats = ["%tc", "%tC", "%td", "%d", "%tw", "%tm", "%tq", "%th", "%ty"]
  193. stata_epoch: Final = datetime(1960, 1, 1)
  194. def _stata_elapsed_date_to_datetime_vec(dates: Series, fmt: str) -> Series:
  195. """
  196. Convert from SIF to datetime. https://www.stata.com/help.cgi?datetime
  197. Parameters
  198. ----------
  199. dates : Series
  200. The Stata Internal Format date to convert to datetime according to fmt
  201. fmt : str
  202. The format to convert to. Can be, tc, td, tw, tm, tq, th, ty
  203. Returns
  204. Returns
  205. -------
  206. converted : Series
  207. The converted dates
  208. Examples
  209. --------
  210. >>> dates = pd.Series([52])
  211. >>> _stata_elapsed_date_to_datetime_vec(dates , "%tw")
  212. 0 1961-01-01
  213. dtype: datetime64[ns]
  214. Notes
  215. -----
  216. datetime/c - tc
  217. milliseconds since 01jan1960 00:00:00.000, assuming 86,400 s/day
  218. datetime/C - tC - NOT IMPLEMENTED
  219. milliseconds since 01jan1960 00:00:00.000, adjusted for leap seconds
  220. date - td
  221. days since 01jan1960 (01jan1960 = 0)
  222. weekly date - tw
  223. weeks since 1960w1
  224. This assumes 52 weeks in a year, then adds 7 * remainder of the weeks.
  225. The datetime value is the start of the week in terms of days in the
  226. year, not ISO calendar weeks.
  227. monthly date - tm
  228. months since 1960m1
  229. quarterly date - tq
  230. quarters since 1960q1
  231. half-yearly date - th
  232. half-years since 1960h1 yearly
  233. date - ty
  234. years since 0000
  235. """
  236. MIN_YEAR, MAX_YEAR = Timestamp.min.year, Timestamp.max.year
  237. MAX_DAY_DELTA = (Timestamp.max - datetime(1960, 1, 1)).days
  238. MIN_DAY_DELTA = (Timestamp.min - datetime(1960, 1, 1)).days
  239. MIN_MS_DELTA = MIN_DAY_DELTA * 24 * 3600 * 1000
  240. MAX_MS_DELTA = MAX_DAY_DELTA * 24 * 3600 * 1000
  241. def convert_year_month_safe(year, month) -> Series:
  242. """
  243. Convert year and month to datetimes, using pandas vectorized versions
  244. when the date range falls within the range supported by pandas.
  245. Otherwise it falls back to a slower but more robust method
  246. using datetime.
  247. """
  248. if year.max() < MAX_YEAR and year.min() > MIN_YEAR:
  249. return to_datetime(100 * year + month, format="%Y%m")
  250. else:
  251. index = getattr(year, "index", None)
  252. return Series([datetime(y, m, 1) for y, m in zip(year, month)], index=index)
  253. def convert_year_days_safe(year, days) -> Series:
  254. """
  255. Converts year (e.g. 1999) and days since the start of the year to a
  256. datetime or datetime64 Series
  257. """
  258. if year.max() < (MAX_YEAR - 1) and year.min() > MIN_YEAR:
  259. return to_datetime(year, format="%Y") + to_timedelta(days, unit="d")
  260. else:
  261. index = getattr(year, "index", None)
  262. value = [
  263. datetime(y, 1, 1) + timedelta(days=int(d)) for y, d in zip(year, days)
  264. ]
  265. return Series(value, index=index)
  266. def convert_delta_safe(base, deltas, unit) -> Series:
  267. """
  268. Convert base dates and deltas to datetimes, using pandas vectorized
  269. versions if the deltas satisfy restrictions required to be expressed
  270. as dates in pandas.
  271. """
  272. index = getattr(deltas, "index", None)
  273. if unit == "d":
  274. if deltas.max() > MAX_DAY_DELTA or deltas.min() < MIN_DAY_DELTA:
  275. values = [base + timedelta(days=int(d)) for d in deltas]
  276. return Series(values, index=index)
  277. elif unit == "ms":
  278. if deltas.max() > MAX_MS_DELTA or deltas.min() < MIN_MS_DELTA:
  279. values = [
  280. base + timedelta(microseconds=(int(d) * 1000)) for d in deltas
  281. ]
  282. return Series(values, index=index)
  283. else:
  284. raise ValueError("format not understood")
  285. base = to_datetime(base)
  286. deltas = to_timedelta(deltas, unit=unit)
  287. return base + deltas
  288. # TODO(non-nano): If/when pandas supports more than datetime64[ns], this
  289. # should be improved to use correct range, e.g. datetime[Y] for yearly
  290. bad_locs = np.isnan(dates)
  291. has_bad_values = False
  292. if bad_locs.any():
  293. has_bad_values = True
  294. dates._values[bad_locs] = 1.0 # Replace with NaT
  295. dates = dates.astype(np.int64)
  296. if fmt.startswith(("%tc", "tc")): # Delta ms relative to base
  297. base = stata_epoch
  298. ms = dates
  299. conv_dates = convert_delta_safe(base, ms, "ms")
  300. elif fmt.startswith(("%tC", "tC")):
  301. warnings.warn(
  302. "Encountered %tC format. Leaving in Stata Internal Format.",
  303. stacklevel=find_stack_level(),
  304. )
  305. conv_dates = Series(dates, dtype=object)
  306. if has_bad_values:
  307. conv_dates[bad_locs] = NaT
  308. return conv_dates
  309. # Delta days relative to base
  310. elif fmt.startswith(("%td", "td", "%d", "d")):
  311. base = stata_epoch
  312. days = dates
  313. conv_dates = convert_delta_safe(base, days, "d")
  314. # does not count leap days - 7 days is a week.
  315. # 52nd week may have more than 7 days
  316. elif fmt.startswith(("%tw", "tw")):
  317. year = stata_epoch.year + dates // 52
  318. days = (dates % 52) * 7
  319. conv_dates = convert_year_days_safe(year, days)
  320. elif fmt.startswith(("%tm", "tm")): # Delta months relative to base
  321. year = stata_epoch.year + dates // 12
  322. month = (dates % 12) + 1
  323. conv_dates = convert_year_month_safe(year, month)
  324. elif fmt.startswith(("%tq", "tq")): # Delta quarters relative to base
  325. year = stata_epoch.year + dates // 4
  326. quarter_month = (dates % 4) * 3 + 1
  327. conv_dates = convert_year_month_safe(year, quarter_month)
  328. elif fmt.startswith(("%th", "th")): # Delta half-years relative to base
  329. year = stata_epoch.year + dates // 2
  330. month = (dates % 2) * 6 + 1
  331. conv_dates = convert_year_month_safe(year, month)
  332. elif fmt.startswith(("%ty", "ty")): # Years -- not delta
  333. year = dates
  334. first_month = np.ones_like(dates)
  335. conv_dates = convert_year_month_safe(year, first_month)
  336. else:
  337. raise ValueError(f"Date fmt {fmt} not understood")
  338. if has_bad_values: # Restore NaT for bad values
  339. conv_dates[bad_locs] = NaT
  340. return conv_dates
  341. def _datetime_to_stata_elapsed_vec(dates: Series, fmt: str) -> Series:
  342. """
  343. Convert from datetime to SIF. https://www.stata.com/help.cgi?datetime
  344. Parameters
  345. ----------
  346. dates : Series
  347. Series or array containing datetime or datetime64[ns] to
  348. convert to the Stata Internal Format given by fmt
  349. fmt : str
  350. The format to convert to. Can be, tc, td, tw, tm, tq, th, ty
  351. """
  352. index = dates.index
  353. NS_PER_DAY = 24 * 3600 * 1000 * 1000 * 1000
  354. US_PER_DAY = NS_PER_DAY / 1000
  355. def parse_dates_safe(
  356. dates: Series, delta: bool = False, year: bool = False, days: bool = False
  357. ):
  358. d = {}
  359. if lib.is_np_dtype(dates.dtype, "M"):
  360. if delta:
  361. time_delta = dates - Timestamp(stata_epoch).as_unit("ns")
  362. d["delta"] = time_delta._values.view(np.int64) // 1000 # microseconds
  363. if days or year:
  364. date_index = DatetimeIndex(dates)
  365. d["year"] = date_index._data.year
  366. d["month"] = date_index._data.month
  367. if days:
  368. days_in_ns = dates._values.view(np.int64) - to_datetime(
  369. d["year"], format="%Y"
  370. )._values.view(np.int64)
  371. d["days"] = days_in_ns // NS_PER_DAY
  372. elif infer_dtype(dates, skipna=False) == "datetime":
  373. if delta:
  374. delta = dates._values - stata_epoch
  375. def f(x: timedelta) -> float:
  376. return US_PER_DAY * x.days + 1000000 * x.seconds + x.microseconds
  377. v = np.vectorize(f)
  378. d["delta"] = v(delta)
  379. if year:
  380. year_month = dates.apply(lambda x: 100 * x.year + x.month)
  381. d["year"] = year_month._values // 100
  382. d["month"] = year_month._values - d["year"] * 100
  383. if days:
  384. def g(x: datetime) -> int:
  385. return (x - datetime(x.year, 1, 1)).days
  386. v = np.vectorize(g)
  387. d["days"] = v(dates)
  388. else:
  389. raise ValueError(
  390. "Columns containing dates must contain either "
  391. "datetime64, datetime or null values."
  392. )
  393. return DataFrame(d, index=index)
  394. bad_loc = isna(dates)
  395. index = dates.index
  396. if bad_loc.any():
  397. if lib.is_np_dtype(dates.dtype, "M"):
  398. dates._values[bad_loc] = to_datetime(stata_epoch)
  399. else:
  400. dates._values[bad_loc] = stata_epoch
  401. if fmt in ["%tc", "tc"]:
  402. d = parse_dates_safe(dates, delta=True)
  403. conv_dates = d.delta / 1000
  404. elif fmt in ["%tC", "tC"]:
  405. warnings.warn(
  406. "Stata Internal Format tC not supported.",
  407. stacklevel=find_stack_level(),
  408. )
  409. conv_dates = dates
  410. elif fmt in ["%td", "td"]:
  411. d = parse_dates_safe(dates, delta=True)
  412. conv_dates = d.delta // US_PER_DAY
  413. elif fmt in ["%tw", "tw"]:
  414. d = parse_dates_safe(dates, year=True, days=True)
  415. conv_dates = 52 * (d.year - stata_epoch.year) + d.days // 7
  416. elif fmt in ["%tm", "tm"]:
  417. d = parse_dates_safe(dates, year=True)
  418. conv_dates = 12 * (d.year - stata_epoch.year) + d.month - 1
  419. elif fmt in ["%tq", "tq"]:
  420. d = parse_dates_safe(dates, year=True)
  421. conv_dates = 4 * (d.year - stata_epoch.year) + (d.month - 1) // 3
  422. elif fmt in ["%th", "th"]:
  423. d = parse_dates_safe(dates, year=True)
  424. conv_dates = 2 * (d.year - stata_epoch.year) + (d.month > 6).astype(int)
  425. elif fmt in ["%ty", "ty"]:
  426. d = parse_dates_safe(dates, year=True)
  427. conv_dates = d.year
  428. else:
  429. raise ValueError(f"Format {fmt} is not a known Stata date format")
  430. conv_dates = Series(conv_dates, dtype=np.float64, copy=False)
  431. missing_value = struct.unpack("<d", b"\x00\x00\x00\x00\x00\x00\xe0\x7f")[0]
  432. conv_dates[bad_loc] = missing_value
  433. return Series(conv_dates, index=index, copy=False)
# User-facing message templates. The ``{0}``/``{1}`` fields are positional
# ``str.format`` placeholders; the code that fills them in is not visible in
# this part of the file.

# Message about a fixed-width string column exceeding the format's limit;
# ``{0}`` is the column name.
excessive_string_length_error: Final = """
Fixed width strings in Stata .dta files are limited to 244 (or fewer)
characters. Column '{0}' does not satisfy this restriction. Use the
'version=117' parameter to write the newer (Stata 13 and later) format.
"""

# Message about a lossy dtype conversion; ``{0}`` and ``{1}`` are the source
# and target types.
precision_loss_doc: Final = """
Column converted from {0} to {1}, and some data are outside of the lossless
conversion range. This may result in a loss of precision in the saved data.
"""

# Message about non-string value labels; ``{0}`` is the column.
value_label_mismatch_doc: Final = """
Stata value labels (pandas categories) must be strings. Column {0} contains
non-string labels which will be converted to strings. Please check that the
Stata data file created has not lost information due to duplicate labels.
"""

# Message listing column-name replacements; ``{0}`` is the list of
# replacements that were made.
invalid_name_doc: Final = """
Not all pandas column names were valid Stata variable names.
The following replacements have been made:
{0}
If this is not what you expect, please make sure you have Stata-compliant
column names in your DataFrame (strings only, max 32 characters, only
alphanumerics and underscores, no Stata reserved words)
"""

# Message about partially labelled categoricals when reading with an iterator.
categorical_conversion_warning: Final = """
One or more series with value labels are not fully labeled. Reading this
dataset with an iterator results in categorical variable with different
categories. This occurs since it is not possible to know all possible values
until the entire dataset has been read. To avoid this warning, you can either
read dataset without an iterator, or manually convert categorical data by
``convert_categoricals`` to False and then accessing the variable labels
through the value_labels method of the reader.
"""
  def _cast_to_stata_types(data: DataFrame) -> DataFrame:
      """
      Convert the columns of a DataFrame to dtypes and ranges Stata can store.

      Parameters
      ----------
      data : DataFrame
          The DataFrame to check and convert. Converted columns are assigned
          back into this object.

      Returns
      -------
      DataFrame
          ``data``, with every affected column replaced by its converted form.

      Raises
      ------
      ValueError
          If a float column contains infinity or -infinity, or a float64
          column has a maximum above the largest double Stata supports.

      Warns
      -----
      PossiblePrecisionLoss
          If integer data are converted to float64 and some values have a
          magnitude of 2**53 or more.

      Notes
      -----
      Numeric columns in Stata must be one of int8, int16, int32, float32 or
      float64, with some additional value restrictions. int8 and int16 columns
      are checked for violations of the value restrictions and upcast if needed.
      int64 data is not usable in Stata, and so it is downcast to int32 whenever
      the values are in the int32 range, and sidecast to float64 when larger
      than this range.

      bool columns are cast to int8. uint columns are converted to int of the
      same size if there is no loss in precision, otherwise are upcast to a
      larger type; uint64 values too large for int64 are converted to float64.
      Nullable integer/boolean extension columns are converted to their NumPy
      dtype and their missing entries are replaced by the Stata missing value
      of the final dtype.
      """
      ws = ""
      # (original dtype, target if the values fit, target if they do not)
      conversion_data: tuple[
          tuple[type, type, type],
          tuple[type, type, type],
          tuple[type, type, type],
          tuple[type, type, type],
          tuple[type, type, type],
      ] = (
          (np.bool_, np.int8, np.int8),
          (np.uint8, np.int8, np.int16),
          (np.uint16, np.int16, np.int32),
          (np.uint32, np.int32, np.int64),
          (np.uint64, np.int64, np.float64),
      )

      # Largest non-missing float/double values in Stata
      float32_max = struct.unpack("<f", b"\xff\xff\xff\x7e")[0]
      float64_max = struct.unpack("<d", b"\xff\xff\xff\xff\xff\xff\xdf\x7f")[0]

      for col in data:
          # Cast from unsupported types to supported types
          is_nullable_int = (
              isinstance(data[col].dtype, ExtensionDtype)
              and data[col].dtype.kind in "iub"
          )
          # We need to find orig_missing before altering data below
          orig_missing = data[col].isna()
          if is_nullable_int:
              fv = 0 if data[col].dtype.kind in "iu" else False
              # Replace with NumPy-compatible column
              data[col] = data[col].fillna(fv).astype(data[col].dtype.numpy_dtype)
          elif isinstance(data[col].dtype, ExtensionDtype):
              if getattr(data[col].dtype, "numpy_dtype", None) is not None:
                  data[col] = data[col].astype(data[col].dtype.numpy_dtype)
              elif is_string_dtype(data[col].dtype):
                  # TODO could avoid converting string dtype to object here,
                  # but handle string dtype in _encode_strings
                  data[col] = data[col].astype("object")
                  # generate_table checks for None values
                  data.loc[data[col].isna(), col] = None

          dtype = data[col].dtype
          empty_df = data.shape[0] == 0
          for c_data in conversion_data:
              if dtype == c_data[0]:
                  if empty_df or data[col].max() <= np.iinfo(c_data[1]).max:
                      dtype = c_data[1]
                  else:
                      dtype = c_data[2]
                      # Only the uint64 row falls back to float64; warn when
                      # the values exceed what float64 represents exactly.
                      if dtype is np.float64 and data[col].max() >= 2**53:
                          ws = precision_loss_doc.format("uint64", "float64")

                  data[col] = data[col].astype(dtype)

          # Check values and upcast if necessary
          if dtype == np.int8 and not empty_df:
              if data[col].max() > 100 or data[col].min() < -127:
                  data[col] = data[col].astype(np.int16)
          elif dtype == np.int16 and not empty_df:
              if data[col].max() > 32740 or data[col].min() < -32767:
                  data[col] = data[col].astype(np.int32)
          elif dtype == np.int64:
              if empty_df or (
                  data[col].max() <= 2147483620 and data[col].min() >= -2147483647
              ):
                  data[col] = data[col].astype(np.int32)
              else:
                  data[col] = data[col].astype(np.float64)
                  if data[col].max() >= 2**53 or data[col].min() <= -(2**53):
                      ws = precision_loss_doc.format("int64", "float64")
          elif dtype in (np.float32, np.float64):
              if np.isinf(data[col]).any():
                  raise ValueError(
                      f"Column {col} contains infinity or -infinity "
                      "which is outside the range supported by Stata."
                  )
              value = data[col].max()
              if dtype == np.float32 and value > float32_max:
                  data[col] = data[col].astype(np.float64)
              elif dtype == np.float64:
                  if value > float64_max:
                      raise ValueError(
                          f"Column {col} has a maximum value ({value}) outside the range "
                          f"supported by Stata ({float64_max})"
                      )

          if is_nullable_int and orig_missing.any():
              # Replace missing by Stata sentinel value
              sentinel = StataMissingValue.BASE_MISSING_VALUES[data[col].dtype.name]
              data.loc[orig_missing, col] = sentinel

      if ws:
          warnings.warn(
              ws,
              PossiblePrecisionLoss,
              stacklevel=find_stack_level(),
          )

      return data
  class StataValueLabel:
      """
      Parse a categorical column and prepare formatted output

      Parameters
      ----------
      catarray : Series
          Categorical Series to encode
      encoding : {"latin-1", "utf-8"}
          Encoding to use for value labels.

      Raises
      ------
      ValueError
          If ``encoding`` is not supported, or the encoded labels (including
          one terminator byte each) exceed 32,000 bytes in total.
      """

      def __init__(
          self, catarray: Series, encoding: Literal["latin-1", "utf-8"] = "latin-1"
      ) -> None:
          if encoding not in ("latin-1", "utf-8"):
              raise ValueError("Only latin-1 and utf-8 are supported.")
          self.labname = catarray.name
          self._encoding = encoding
          categories = catarray.cat.categories
          # Materialise the (code, category) pairs: a bare ``enumerate`` would
          # be exhausted by the first pass of ``_prepare_value_labels``.
          self.value_labels = list(enumerate(categories))

          self._prepare_value_labels()

      def _prepare_value_labels(self) -> None:
          """Encode value labels."""
          self.text_len = 0
          self.txt: list[bytes] = []
          self.n = 0
          # Offsets (length of categories), converted to int32
          self.off = np.array([], dtype=np.int32)
          # Values, converted to int32
          self.val = np.array([], dtype=np.int32)
          self.len = 0

          # Compute lengths and setup lists of offsets and labels
          offsets: list[int] = []
          values: list[float] = []
          for vl in self.value_labels:
              category: str | bytes = vl[1]
              if not isinstance(category, str):
                  category = str(category)
                  warnings.warn(
                      value_label_mismatch_doc.format(self.labname),
                      ValueLabelTypeMismatch,
                      stacklevel=find_stack_level(),
                  )
              category = category.encode(self._encoding)
              offsets.append(self.text_len)
              self.text_len += len(category) + 1  # +1 for the padding
              values.append(vl[0])
              self.txt.append(category)
              self.n += 1

              if self.text_len > 32000:
                  raise ValueError(
                      "Stata value labels for a single variable must "
                      "have a combined length less than 32,000 characters."
                  )

          # Ensure int32
          self.off = np.array(offsets, dtype=np.int32)
          self.val = np.array(values, dtype=np.int32)

          # Total length: n, textlen, the two int32 tables and the label text
          self.len = 4 + 4 + 4 * self.n + 4 * self.n + self.text_len

      def generate_value_label(self, byteorder: str) -> bytes:
          """
          Generate the binary representation of the value labels.

          Parameters
          ----------
          byteorder : str
              Byte order of the output

          Returns
          -------
          value_label : bytes
              Bytes containing the formatted value label
          """
          encoding = self._encoding
          bio = BytesIO()
          null_byte = b"\x00"

          # len
          bio.write(struct.pack(byteorder + "i", self.len))

          # labname
          labname = str(self.labname)[:32].encode(encoding)
          lab_len = 128 if encoding in ("utf-8", "utf8") else 32
          labname = _pad_bytes(labname, lab_len + 1)
          bio.write(labname)

          # padding - 3 bytes
          bio.write(null_byte * 3)

          # value_label_table
          # n - int32
          bio.write(struct.pack(byteorder + "i", self.n))

          # textlen - int32
          bio.write(struct.pack(byteorder + "i", self.text_len))

          # off - int32 array (n elements)
          for offset in self.off:
              bio.write(struct.pack(byteorder + "i", offset))

          # val - int32 array (n elements)
          for value in self.val:
              bio.write(struct.pack(byteorder + "i", value))

          # txt - Text labels, null terminated
          for text in self.txt:
              bio.write(text + null_byte)

          return bio.getvalue()
  class StataNonCatValueLabel(StataValueLabel):
      """
      Prepare formatted version of value labels

      Unlike the parent class, the labels come from an explicit mapping
      instead of the categories of a Series.

      Parameters
      ----------
      labname : str
          Value label name
      value_labels: Dictionary
          Mapping of values to labels
      encoding : {"latin-1", "utf-8"}
          Encoding to use for value labels.
      """

      def __init__(
          self,
          labname: str,
          value_labels: dict[float, str],
          encoding: Literal["latin-1", "utf-8"] = "latin-1",
      ) -> None:
          supported = ("latin-1", "utf-8")
          if encoding not in supported:
              raise ValueError("Only latin-1 and utf-8 are supported.")

          self.labname = labname
          self._encoding = encoding
          # Store the pairs ordered by their numeric value
          pairs = value_labels.items()
          self.value_labels = sorted(  # type: ignore[assignment]
              pairs, key=lambda pair: pair[0]
          )
          self._prepare_value_labels()
  class StataMissingValue:
      """
      An observation's missing value.

      Parameters
      ----------
      value : {int, float}
          The Stata missing value code

      Raises
      ------
      KeyError
          If ``value`` is not one of the codes in ``MISSING_VALUES``.

      Notes
      -----
      More information: <https://www.stata.com/help.cgi?missing>

      Integer missing values map the codes '.', '.a', ..., '.z' to the ranges
      101 ... 127 (for int8), 32741 ... 32767 (for int16) and 2147483621 ...
      2147483647 (for int32).  Missing values for floating point data types are
      more complex but the pattern is simple to discern from the following table.

      np.float32 missing values (float in Stata)
      0000007f    .
      0008007f    .a
      0010007f    .b
      ...
      00c0007f    .x
      00c8007f    .y
      00d0007f    .z

      np.float64 missing values (double in Stata)
      000000000000e07f    .
      000000000001e07f    .a
      000000000002e07f    .b
      ...
      000000000018e07f    .x
      000000000019e07f    .y
      00000000001ae07f    .z
      """

      # Construct a dictionary of missing values
      MISSING_VALUES: dict[float, str] = {}
      bases: Final = (101, 32741, 2147483621)
      for b in bases:
          # Integer codes: base is '.', base + 1 ... base + 26 are '.a' ... '.z'
          MISSING_VALUES[b] = "."
          for i in range(1, 27):
              MISSING_VALUES[i + b] = "." + chr(96 + i)

      # Bit pattern of '.' for float32 and the step between consecutive codes.
      # The base is left untouched so that BASE_MISSING_VALUES below really
      # refers to '.', not to the pattern that follows '.z'.
      float32_base: bytes = b"\x00\x00\x00\x7f"
      increment_32: int = struct.unpack("<i", b"\x00\x08\x00\x00")[0]
      int_value = struct.unpack("<i", float32_base)[0]
      for i in range(27):
          key = struct.unpack("<f", struct.pack("<i", int_value))[0]
          MISSING_VALUES[key] = "." if i == 0 else "." + chr(96 + i)
          int_value += increment_32

      # Same construction for float64; little-endian throughout, to agree with
      # the "<d" interpretation of the base pattern.
      float64_base: bytes = b"\x00\x00\x00\x00\x00\x00\xe0\x7f"
      increment_64 = struct.unpack("<q", b"\x00\x00\x00\x00\x00\x01\x00\x00")[0]
      int_value = struct.unpack("<q", float64_base)[0]
      for i in range(27):
          key = struct.unpack("<d", struct.pack("<q", int_value))[0]
          MISSING_VALUES[key] = "." if i == 0 else "." + chr(96 + i)
          int_value += increment_64

      # The generic '.' missing value for each supported dtype name
      BASE_MISSING_VALUES: Final = {
          "int8": 101,
          "int16": 32741,
          "int32": 2147483621,
          "float32": struct.unpack("<f", float32_base)[0],
          "float64": struct.unpack("<d", float64_base)[0],
      }

      def __init__(self, value: float) -> None:
          self._value = value
          # Conversion to int to avoid hash issues on 32 bit platforms #8968
          value = int(value) if value < 2147483648 else float(value)
          self._str = self.MISSING_VALUES[value]

      @property
      def string(self) -> str:
          """
          The Stata representation of the missing value: '.', '.a'..'.z'

          Returns
          -------
          str
              The representation of the missing value.
          """
          return self._str

      @property
      def value(self) -> float:
          """
          The binary representation of the missing value.

          Returns
          -------
          {int, float}
              The binary representation of the missing value.
          """
          return self._value

      def __str__(self) -> str:
          return self.string

      def __repr__(self) -> str:
          return f"{type(self)}({self})"

      def __eq__(self, other: object) -> bool:
          return (
              isinstance(other, type(self))
              and self.string == other.string
              and self.value == other.value
          )

      def __hash__(self) -> int:
          # Defined alongside __eq__ so that equal instances hash equally
          return hash((self.string, self.value))

      @classmethod
      def get_base_missing_value(cls, dtype: np.dtype) -> float:
          """
          Return the generic ('.') missing value for a NumPy dtype.

          Parameters
          ----------
          dtype : np.dtype
              One of int8, int16, int32, float32 or float64.

          Returns
          -------
          {int, float}
              The entry of ``BASE_MISSING_VALUES`` for ``dtype``.

          Raises
          ------
          ValueError
              If ``dtype`` is not one of the supported dtypes.
          """
          names = {
              np.int8: "int8",
              np.int16: "int16",
              np.int32: "int32",
              np.float32: "float32",
              np.float64: "float64",
          }
          try:
              name = names[dtype.type]
          except KeyError as err:
              raise ValueError("Unsupported dtype") from err
          return cls.BASE_MISSING_VALUES[name]
  class StataParser:
      """
      Holder of the lookup tables that describe the dta type system.

      Each instance carries tables translating dta type codes to struct format
      characters and NumPy dtypes, the valid value range and generic missing
      value per numeric type, and the set of reserved words.
      """

      def __init__(self) -> None:
          # type          code.
          # --------------------
          # str1        1 = 0x01
          # str2        2 = 0x02
          # ...
          # str244    244 = 0xf4
          # byte      251 = 0xfb  (sic)
          # int       252 = 0xfc
          # long      253 = 0xfd
          # float     254 = 0xfe
          # double    255 = 0xff
          # --------------------
          # NOTE: the byte type seems to be reserved for categorical variables
          # with a label, but the underlying variable is -127 to 100
          # we're going to drop the label and cast to int
          string_dtypes = {width: np.dtype(f"S{width}") for width in range(1, 245)}
          numeric_dtypes = {
              251: np.dtype(np.int8),
              252: np.dtype(np.int16),
              253: np.dtype(np.int32),
              254: np.dtype(np.float32),
              255: np.dtype(np.float64),
          }
          self.DTYPE_MAP = {**string_dtypes, **numeric_dtypes}

          self.DTYPE_MAP_XML: dict[int, np.dtype] = {
              32768: np.dtype(np.uint8),  # Keys to GSO
              65526: np.dtype(np.float64),
              65527: np.dtype(np.float32),
              65528: np.dtype(np.int32),
              65529: np.dtype(np.int16),
              65530: np.dtype(np.int8),
          }

          # Codes 0-250 map to themselves, 251-255 to struct format characters
          self.TYPE_MAP = [*range(251), *"bhlfd"]
          self.TYPE_MAP_XML = {
              # Not really a Q, unclear how to handle byteswap
              32768: "Q",
              65526: "d",
              65527: "f",
              65528: "l",
              65529: "h",
              65530: "b",
          }

          # NOTE: technically, some of these are wrong. there are more numbers
          # that can be represented. it's the 27 ABOVE and BELOW the max listed
          # numeric data type in [U] 12.2.2 of the 11.2 manual
          def _le_value(fmt: str, raw: bytes) -> float:
              # Decode one little-endian float/double bit pattern
              return struct.unpack(fmt, raw)[0]

          float32_limits = (
              np.float32(_le_value("<f", b"\xff\xff\xff\xfe")),
              np.float32(_le_value("<f", b"\xff\xff\xff\x7e")),
          )
          float64_limits = (
              np.float64(_le_value("<d", b"\xff\xff\xff\xff\xff\xff\xef\xff")),
              np.float64(_le_value("<d", b"\xff\xff\xff\xff\xff\xff\xdf\x7f")),
          )
          self.VALID_RANGE = {
              "b": (-127, 100),
              "h": (-32767, 32740),
              "l": (-2147483647, 2147483620),
              "f": float32_limits,
              "d": float64_limits,
          }

          self.OLD_TYPE_MAPPING = {
              98: 251,  # byte
              105: 252,  # int
              108: 253,  # long
              102: 254,  # float
              100: 255,  # double
          }

          # These missing values are the generic '.' in Stata, and are used
          # to replace nans
          self.MISSING_VALUES = {
              "b": 101,
              "h": 32741,
              "l": 2147483621,
              "f": np.float32(_le_value("<f", b"\x00\x00\x00\x7f")),
              "d": np.float64(_le_value("<d", b"\x00\x00\x00\x00\x00\x00\xe0\x7f")),
          }
          self.NUMPY_TYPE_MAP = {
              "b": "i1",
              "h": "i2",
              "l": "i4",
              "f": "f4",
              "d": "f8",
              "Q": "u8",
          }

          # Reserved words cannot be used as variable names
          self.RESERVED_WORDS = {
              "aggregate",
              "array",
              "boolean",
              "break",
              "byte",
              "case",
              "catch",
              "class",
              "colvector",
              "complex",
              "const",
              "continue",
              "default",
              "delegate",
              "delete",
              "do",
              "double",
              "else",
              "eltypedef",
              "end",
              "enum",
              "explicit",
              "export",
              "external",
              "float",
              "for",
              "friend",
              "function",
              "global",
              "goto",
              "if",
              "inline",
              "int",
              "local",
              "long",
              "NULL",
              "pragma",
              "protected",
              "quad",
              "rowvector",
              "short",
              "typedef",
              "typename",
              "virtual",
              "_all",
              "_N",
              "_skip",
              "_b",
              "_pi",
              "str#",
              "in",
              "_pred",
              "strL",
              "_coef",
              "_rc",
              "using",
              "_cons",
              "_se",
              "with",
              "_n",
          }
  973. class StataReader(StataParser, abc.Iterator):
  974. __doc__ = _stata_reader_doc
  975. _path_or_buf: IO[bytes] # Will be assigned by `_open_file`.
  def __init__(
      self,
      path_or_buf: FilePath | ReadBuffer[bytes],
      convert_dates: bool = True,
      convert_categoricals: bool = True,
      index_col: str | None = None,
      convert_missing: bool = False,
      preserve_dtypes: bool = True,
      columns: Sequence[str] | None = None,
      order_categoricals: bool = True,
      chunksize: int | None = None,
      compression: CompressionOptions = "infer",
      storage_options: StorageOptions | None = None,
  ) -> None:
      """
      Store the reader options and initialise the reader state.

      The file is not opened here; that happens in ``_open_file``.

      Raises
      ------
      ValueError
          If ``chunksize`` is given but is not a positive integer.
      """
      super().__init__()

      # Arguments to the reader (can be temporarily overridden in
      # calls to read).
      self._convert_dates = convert_dates
      self._convert_categoricals = convert_categoricals
      self._index_col = index_col
      self._convert_missing = convert_missing
      self._preserve_dtypes = preserve_dtypes
      self._columns = columns
      self._order_categoricals = order_categoricals

      # Where and how to open the data
      self._original_path_or_buf = path_or_buf
      self._compression = compression
      self._storage_options = storage_options
      self._encoding = ""

      # Iteration settings
      self._chunksize = chunksize
      self._using_iterator = False
      self._entered = False
      if chunksize is None:
          self._chunksize = 1
      elif not (isinstance(chunksize, int) and chunksize > 0):
          raise ValueError("chunksize must be a positive integer when set.")

      # State variables for the file
      self._close_file: Callable[[], None] | None = None
      self._missing_values = False
      self._can_read_value_labels = False
      self._column_selector_set = False
      self._value_labels_read = False
      self._data_read = False
      self._dtype: np.dtype | None = None
      self._lines_read = 0

      self._native_byteorder = _set_endianness(sys.byteorder)
  1021. def _ensure_open(self) -> None:
  1022. """
  1023. Ensure the file has been opened and its header data read.
  1024. """
  1025. if not hasattr(self, "_path_or_buf"):
  1026. self._open_file()
  def _open_file(self) -> None:
      """
      Open the file (with compression options, etc.), and read header information.

      Emits a ResourceWarning when the reader has not been entered as a
      context manager.
      """
      if not self._entered:
          warnings.warn(
              "StataReader is being used without using a context manager. "
              "Using StataReader as a context manager is the only supported method.",
              ResourceWarning,
              stacklevel=find_stack_level(),
          )
      handles = get_handle(
          self._original_path_or_buf,
          "rb",
          storage_options=self._storage_options,
          is_text=False,
          compression=self._compression,
      )
      raw = handles.handle
      if hasattr(raw, "seekable") and raw.seekable():
          # If the handle is directly seekable, use it without an extra copy.
          self._path_or_buf = raw
          self._close_file = handles.close
      else:
          # Copy to memory, and ensure no encoding.
          with handles:
              self._path_or_buf = BytesIO(raw.read())
          self._close_file = self._path_or_buf.close

      self._read_header()
      self._setup_dtype()
  1056. def __enter__(self) -> Self:
  1057. """enter context manager"""
  1058. self._entered = True
  1059. return self
  1060. def __exit__(
  1061. self,
  1062. exc_type: type[BaseException] | None,
  1063. exc_value: BaseException | None,
  1064. traceback: TracebackType | None,
  1065. ) -> None:
  1066. if self._close_file:
  1067. self._close_file()
  1068. def close(self) -> None:
  1069. """Close the handle if its open.
  1070. .. deprecated: 2.0.0
  1071. The close method is not part of the public API.
  1072. The only supported way to use StataReader is to use it as a context manager.
  1073. """
  1074. warnings.warn(
  1075. "The StataReader.close() method is not part of the public API and "
  1076. "will be removed in a future version without notice. "
  1077. "Using StataReader as a context manager is the only supported method.",
  1078. FutureWarning,
  1079. stacklevel=find_stack_level(),
  1080. )
  1081. if self._close_file:
  1082. self._close_file()
  1083. def _set_encoding(self) -> None:
  1084. """
  1085. Set string encoding which depends on file version
  1086. """
  1087. if self._format_version < 118:
  1088. self._encoding = "latin-1"
  1089. else:
  1090. self._encoding = "utf-8"
  1091. def _read_int8(self) -> int:
  1092. return struct.unpack("b", self._path_or_buf.read(1))[0]
  1093. def _read_uint8(self) -> int:
  1094. return struct.unpack("B", self._path_or_buf.read(1))[0]
  1095. def _read_uint16(self) -> int:
  1096. return struct.unpack(f"{self._byteorder}H", self._path_or_buf.read(2))[0]
  1097. def _read_uint32(self) -> int:
  1098. return struct.unpack(f"{self._byteorder}I", self._path_or_buf.read(4))[0]
  1099. def _read_uint64(self) -> int:
  1100. return struct.unpack(f"{self._byteorder}Q", self._path_or_buf.read(8))[0]
  1101. def _read_int16(self) -> int:
  1102. return struct.unpack(f"{self._byteorder}h", self._path_or_buf.read(2))[0]
  1103. def _read_int32(self) -> int:
  1104. return struct.unpack(f"{self._byteorder}i", self._path_or_buf.read(4))[0]
  1105. def _read_int64(self) -> int:
  1106. return struct.unpack(f"{self._byteorder}q", self._path_or_buf.read(8))[0]
  1107. def _read_char8(self) -> bytes:
  1108. return struct.unpack("c", self._path_or_buf.read(1))[0]
  1109. def _read_int16_count(self, count: int) -> tuple[int, ...]:
  1110. return struct.unpack(
  1111. f"{self._byteorder}{'h' * count}",
  1112. self._path_or_buf.read(2 * count),
  1113. )
  1114. def _read_header(self) -> None:
  1115. first_char = self._read_char8()
  1116. if first_char == b"<":
  1117. self._read_new_header()
  1118. else:
  1119. self._read_old_header(first_char)
  def _read_new_header(self) -> None:
      """
      Read the header, the section map and the variable metadata of a
      tag-delimited file (format versions 117, 118 and 119).

      The leading ``<`` has already been consumed by the caller.

      Raises
      ------
      ValueError
          If the release number in the header is not 117, 118 or 119.
      """
      buf = self._path_or_buf

      # The first part of the header is common to 117 - 119.
      buf.read(27)  # stata_dta><header><release>
      self._format_version = int(buf.read(3))
      if self._format_version not in [117, 118, 119]:
          raise ValueError(_version_error.format(version=self._format_version))
      self._set_encoding()
      buf.read(21)  # </release><byteorder>
      self._byteorder = ">" if buf.read(3) == b"MSF" else "<"
      buf.read(15)  # </byteorder><K>
      if self._format_version <= 118:
          self._nvar = self._read_uint16()
      else:
          self._nvar = self._read_uint32()
      buf.read(7)  # </K><N>

      self._nobs = self._get_nobs()
      buf.read(11)  # </N><label>
      self._data_label = self._get_data_label()
      buf.read(19)  # </label><timestamp>
      self._time_stamp = self._get_time_stamp()
      buf.read(26)  # </timestamp></header><map>
      buf.read(8)  # 0x0000000000000000
      buf.read(8)  # position of <map>

      # Section offsets; the constants skip past each section's opening tag
      self._seek_vartypes = self._read_int64() + 16
      self._seek_varnames = self._read_int64() + 10
      self._seek_sortlist = self._read_int64() + 10
      self._seek_formats = self._read_int64() + 9
      self._seek_value_label_names = self._read_int64() + 19

      # Requires version-specific treatment
      self._seek_variable_labels = self._get_seek_variable_labels()

      buf.read(8)  # <characteristics>
      self._data_location = self._read_int64() + 6
      self._seek_strls = self._read_int64() + 7
      self._seek_value_labels = self._read_int64() + 14

      self._typlist, self._dtyplist = self._get_dtypes(self._seek_vartypes)

      buf.seek(self._seek_varnames)
      self._varlist = self._get_varlist()

      buf.seek(self._seek_sortlist)
      self._srtlist = self._read_int16_count(self._nvar + 1)[:-1]

      buf.seek(self._seek_formats)
      self._fmtlist = self._get_fmtlist()

      buf.seek(self._seek_value_label_names)
      self._lbllist = self._get_lbllist()

      buf.seek(self._seek_variable_labels)
      self._variable_labels = self._get_variable_labels()
  1164. # Get data type information, works for versions 117-119.
  1165. def _get_dtypes(
  1166. self, seek_vartypes: int
  1167. ) -> tuple[list[int | str], list[str | np.dtype]]:
  1168. self._path_or_buf.seek(seek_vartypes)
  1169. typlist = []
  1170. dtyplist = []
  1171. for _ in range(self._nvar):
  1172. typ = self._read_uint16()
  1173. if typ <= 2045:
  1174. typlist.append(typ)
  1175. dtyplist.append(str(typ))
  1176. else:
  1177. try:
  1178. typlist.append(self.TYPE_MAP_XML[typ]) # type: ignore[arg-type]
  1179. dtyplist.append(self.DTYPE_MAP_XML[typ]) # type: ignore[arg-type]
  1180. except KeyError as err:
  1181. raise ValueError(f"cannot convert stata types [{typ}]") from err
  1182. return typlist, dtyplist # type: ignore[return-value]
  1183. def _get_varlist(self) -> list[str]:
  1184. # 33 in order formats, 129 in formats 118 and 119
  1185. b = 33 if self._format_version < 118 else 129
  1186. return [self._decode(self._path_or_buf.read(b)) for _ in range(self._nvar)]
  1187. # Returns the format list
  1188. def _get_fmtlist(self) -> list[str]:
  1189. if self._format_version >= 118:
  1190. b = 57
  1191. elif self._format_version > 113:
  1192. b = 49
  1193. elif self._format_version > 104:
  1194. b = 12
  1195. else:
  1196. b = 7
  1197. return [self._decode(self._path_or_buf.read(b)) for _ in range(self._nvar)]
  1198. # Returns the label list
  1199. def _get_lbllist(self) -> list[str]:
  1200. if self._format_version >= 118:
  1201. b = 129
  1202. elif self._format_version > 108:
  1203. b = 33
  1204. else:
  1205. b = 9
  1206. return [self._decode(self._path_or_buf.read(b)) for _ in range(self._nvar)]
  1207. def _get_variable_labels(self) -> list[str]:
  1208. if self._format_version >= 118:
  1209. vlblist = [
  1210. self._decode(self._path_or_buf.read(321)) for _ in range(self._nvar)
  1211. ]
  1212. elif self._format_version > 105:
  1213. vlblist = [
  1214. self._decode(self._path_or_buf.read(81)) for _ in range(self._nvar)
  1215. ]
  1216. else:
  1217. vlblist = [
  1218. self._decode(self._path_or_buf.read(32)) for _ in range(self._nvar)
  1219. ]
  1220. return vlblist
  1221. def _get_nobs(self) -> int:
  1222. if self._format_version >= 118:
  1223. return self._read_uint64()
  1224. else:
  1225. return self._read_uint32()
  1226. def _get_data_label(self) -> str:
  1227. if self._format_version >= 118:
  1228. strlen = self._read_uint16()
  1229. return self._decode(self._path_or_buf.read(strlen))
  1230. elif self._format_version == 117:
  1231. strlen = self._read_int8()
  1232. return self._decode(self._path_or_buf.read(strlen))
  1233. elif self._format_version > 105:
  1234. return self._decode(self._path_or_buf.read(81))
  1235. else:
  1236. return self._decode(self._path_or_buf.read(32))
  1237. def _get_time_stamp(self) -> str:
  1238. if self._format_version >= 118:
  1239. strlen = self._read_int8()
  1240. return self._path_or_buf.read(strlen).decode("utf-8")
  1241. elif self._format_version == 117:
  1242. strlen = self._read_int8()
  1243. return self._decode(self._path_or_buf.read(strlen))
  1244. elif self._format_version > 104:
  1245. return self._decode(self._path_or_buf.read(18))
  1246. else:
  1247. raise ValueError()
  1248. def _get_seek_variable_labels(self) -> int:
  1249. if self._format_version == 117:
  1250. self._path_or_buf.read(8) # <variable_labels>, throw away
  1251. # Stata 117 data files do not follow the described format. This is
  1252. # a work around that uses the previous label, 33 bytes for each
  1253. # variable, 20 for the closing tag and 17 for the opening tag
  1254. return self._seek_value_label_names + (33 * self._nvar) + 20 + 17
  1255. elif self._format_version >= 118:
  1256. return self._read_int64() + 17
  1257. else:
  1258. raise ValueError()
  def _read_old_header(self, first_char: bytes) -> None:
      """
      Parse the header of a pre-117 dta file and populate reader state.

      The format version is taken from the first byte of ``first_char``.
      The remaining header fields are then read sequentially from
      ``self._path_or_buf``, so the order of the statements below follows
      the on-disk layout and must not be changed.

      Parameters
      ----------
      first_char : bytes
          Bytes already consumed from the file; only ``first_char[0]`` is
          used, as the format version.

      Raises
      ------
      ValueError
          If the format version is not one of 104, 105, 108, 111, 113, 114
          or 115.
      """
      self._format_version = int(first_char[0])
      if self._format_version not in [104, 105, 108, 111, 113, 114, 115]:
          raise ValueError(_version_error.format(version=self._format_version))
      self._set_encoding()
      # A byte-order flag of 0x1 selects big-endian, anything else little-endian
      self._byteorder = ">" if self._read_int8() == 0x1 else "<"
      self._filetype = self._read_int8()
      self._path_or_buf.read(1)  # unused

      self._nvar = self._read_uint16()
      self._nobs = self._get_nobs()

      self._data_label = self._get_data_label()

      self._time_stamp = self._get_time_stamp()

      # descriptors
      if self._format_version > 108:
          # One type code per variable, one byte each
          typlist = [int(c) for c in self._path_or_buf.read(self._nvar)]
      else:
          # Formats up to 108 use different type codes: translate the ones
          # listed in OLD_TYPE_MAPPING and shift every other code by 127
          buf = self._path_or_buf.read(self._nvar)
          typlistb = np.frombuffer(buf, dtype=np.uint8)
          typlist = []
          for tp in typlistb:
              if tp in self.OLD_TYPE_MAPPING:
                  typlist.append(self.OLD_TYPE_MAPPING[tp])
              else:
                  typlist.append(tp - 127)  # bytes

      # NOTE(review): only ValueError is caught in the two lookups below.
      # Whether a failed TYPE_MAP / DTYPE_MAP lookup raises ValueError depends
      # on how those containers are defined outside this block -- confirm.
      try:
          self._typlist = [self.TYPE_MAP[typ] for typ in typlist]
      except ValueError as err:
          invalid_types = ",".join([str(x) for x in typlist])
          raise ValueError(f"cannot convert stata types [{invalid_types}]") from err
      try:
          self._dtyplist = [self.DTYPE_MAP[typ] for typ in typlist]
      except ValueError as err:
          invalid_dtypes = ",".join([str(x) for x in typlist])
          raise ValueError(f"cannot convert stata dtypes [{invalid_dtypes}]") from err

      # Variable names are fixed-width fields: 33 bytes after format 108,
      # 9 bytes otherwise
      if self._format_version > 108:
          self._varlist = [
              self._decode(self._path_or_buf.read(33)) for _ in range(self._nvar)
          ]
      else:
          self._varlist = [
              self._decode(self._path_or_buf.read(9)) for _ in range(self._nvar)
          ]
      # nvar + 1 entries are read and the last one is dropped
      self._srtlist = self._read_int16_count(self._nvar + 1)[:-1]

      self._fmtlist = self._get_fmtlist()

      self._lbllist = self._get_lbllist()

      self._variable_labels = self._get_variable_labels()

      # ignore expansion fields (Format 105 and later)
      # When reading, read five bytes; the last four bytes now tell you
      # the size of the next read, which you discard. You then continue
      # like this until you read 5 bytes of zeros.
      if self._format_version > 104:
          while True:
              data_type = self._read_int8()
              # The length field is an int32 after format 108, an int16 before
              if self._format_version > 108:
                  data_len = self._read_int32()
              else:
                  data_len = self._read_int16()
              # The length is consumed before the terminator check so the
              # file position ends up past the whole terminating record
              if data_type == 0:
                  break
              self._path_or_buf.read(data_len)

      # necessary data to continue parsing
      self._data_location = self._path_or_buf.tell()
  def _setup_dtype(self) -> np.dtype:
      """
      Build, cache and return the numpy record dtype for one data row.

      Each entry of ``self._typlist`` becomes a field named ``s<position>``.
      Entries found in ``NUMPY_TYPE_MAP`` use the mapped numpy code prefixed
      with the file byte order; every other entry is used as the width of a
      fixed-length bytes field (``S<typ>``). The result is stored in
      ``self._dtype`` and returned unchanged on later calls.
      """
      if self._dtype is None:
          numpy_codes = self.NUMPY_TYPE_MAP
          fields = []
          for position, stata_type in enumerate(self._typlist):
              if stata_type in numpy_codes:
                  # only strs in NUMPY_TYPE_MAP
                  code = numpy_codes[cast(str, stata_type)]
                  field_format = f"{self._byteorder}{code}"
              else:
                  field_format = f"S{stata_type}"
              fields.append((f"s{position}", field_format))
          self._dtype = np.dtype(fields)
      return self._dtype
  def _decode(self, s: bytes) -> str:
      """
      Decode a null-terminated bytes field into a str.

      Everything from the first null byte onwards is dropped. The remainder
      is decoded with ``self._encoding``; if that fails, a ``UnicodeWarning``
      is emitted and the bytes are decoded as latin-1 instead.
      """
      # have bytes not strings, so must decode
      payload, _, _ = s.partition(b"\0")
      try:
          return payload.decode(self._encoding)
      except UnicodeDecodeError:
          # GH 25960, fallback to handle incorrect format produced when 117
          # files are converted to 118 files in Stata
          encoding = self._encoding
          msg = (
              "\n"
              "One or more strings in the dta file could not be decoded "
              f"using {encoding}, and\n"
              "so the fallback encoding of latin-1 is being used. "
              "This can happen when a file\n"
              "has been incorrectly encoded by Stata or some other software. "
              "You should verify\n"
              "the string values returned are correct."
          )
          warnings.warn(msg, UnicodeWarning, stacklevel=find_stack_level())
          return payload.decode("latin-1")
  def _read_value_labels(self) -> None:
      """
      Read the value label table into ``self._value_label_dict``.

      Does nothing if the labels were already read. For format versions 108
      and earlier no labels are read and the dictionary is left empty. For
      later versions the file is positioned at the label table (an explicit
      seek position for 117+, otherwise directly after the data rows) and
      every label set is stored as ``{label name: {value: text}}``.
      """
      self._ensure_open()
      if self._value_labels_read:
          # Don't read twice
          return
      version = self._format_version
      if version <= 108:
          # Value labels are not supported in version 108 and earlier.
          self._value_labels_read = True
          self._value_label_dict: dict[str, dict[float, str]] = {}
          return

      handle = self._path_or_buf
      tagged = version >= 117
      if tagged:
          handle.seek(self._seek_value_labels)
      else:
          assert self._dtype is not None
          handle.seek(self._data_location + self._nobs * self._dtype.itemsize)

      self._value_labels_read = True
      self._value_label_dict = {}

      # Label names are 33 bytes wide up to format 117 and 129 bytes after
      name_width = 33 if version <= 117 else 129
      int_dtype = f"{self._byteorder}i4"

      while True:
          if tagged and handle.read(5) == b"</val":  # <lbl>
              break  # end of value label table
          if not handle.read(4):
              break  # end of value label table (format < 117)
          label_name = self._decode(handle.read(name_width))
          handle.read(3)  # padding

          count = self._read_uint32()
          text_len = self._read_uint32()
          offsets = np.frombuffer(handle.read(4 * count), dtype=int_dtype, count=count)
          values = np.frombuffer(handle.read(4 * count), dtype=int_dtype, count=count)
          # Sort by text offset so each label ends where the next one starts
          order = np.argsort(offsets)
          offsets = offsets[order]
          values = values[order]
          text = handle.read(text_len)

          labels = self._value_label_dict[label_name] = {}
          last = count - 1
          for pos in range(count):
              stop = offsets[pos + 1] if pos < last else text_len
              labels[values[pos]] = self._decode(text[offsets[pos] : stop])
          if tagged:
              handle.read(6)  # </lbl>
      self._value_labels_read = True
  def _read_strls(self) -> None:
      """
      Read all GSO (strL) records into the ``self.GSO`` lookup.

      Seeks to ``self._seek_strls`` and consumes records for as long as each
      one starts with the ``b"GSO"`` marker. Keys are the stringified (v, o)
      identifier; values are the decoded text for type 130, or ``str()`` of
      the raw bytes for any other type.
      """
      handle = self._path_or_buf
      handle.seek(self._seek_strls)
      # Wrap v_o in a string to allow uint64 values as keys on 32bit OS
      self.GSO = {"0": ""}
      while handle.read(3) == b"GSO":
          if self._format_version == 117:
              key = self._read_uint64()
          else:
              packed = handle.read(12)
              # Only tested on little endian file on little endian machine.
              v_width = 2 if self._format_version == 118 else 3
              if self._byteorder == "<":
                  o_part = packed[4 : (12 - v_width)]
              else:
                  # This path may not be correct, impossible to test
                  o_part = packed[(4 + v_width) :]
              (key,) = struct.unpack("Q", packed[0:v_width] + o_part)
          gso_type = self._read_uint8()
          payload = handle.read(self._read_uint32())
          if gso_type == 130:
              # The final byte is dropped before decoding
              text = payload[0:-1].decode(self._encoding)
          else:
              # Stata says typ 129 can be binary, so use str
              text = str(payload)
          # Wrap v_o in a string to allow uint64 values as keys on 32bit OS
          self.GSO[str(key)] = text
  def __next__(self) -> DataFrame:
      """Mark the reader as iterating and read the next ``_chunksize`` rows."""
      self._using_iterator = True
      chunk = self.read(nrows=self._chunksize)
      return chunk
  def get_chunk(self, size: int | None = None) -> DataFrame:
      """
      Reads lines from Stata file and returns as dataframe

      Parameters
      ----------
      size : int, defaults to None
          Number of lines to read. If None, the reader's configured
          chunksize is passed to ``read`` instead.

      Returns
      -------
      DataFrame
      """
      nrows = self._chunksize if size is None else size
      return self.read(nrows=nrows)
  @Appender(_read_method_doc)
  def read(
      self,
      nrows: int | None = None,
      convert_dates: bool | None = None,
      convert_categoricals: bool | None = None,
      index_col: str | None = None,
      convert_missing: bool | None = None,
      preserve_dtypes: bool | None = None,
      columns: Sequence[str] | None = None,
      order_categoricals: bool | None = None,
  ) -> DataFrame:
      # Read up to ``nrows`` rows, starting after the rows already consumed
      # (``self._lines_read``), and return them as a DataFrame. Every option
      # passed as None falls back to the value stored on the reader.
      # No docstring is written here: the decorator above supplies it.
      self._ensure_open()

      # Handle options
      if convert_dates is None:
          convert_dates = self._convert_dates
      if convert_categoricals is None:
          convert_categoricals = self._convert_categoricals
      if convert_missing is None:
          convert_missing = self._convert_missing
      if preserve_dtypes is None:
          preserve_dtypes = self._preserve_dtypes
      if columns is None:
          columns = self._columns
      if order_categoricals is None:
          order_categoricals = self._order_categoricals
      if index_col is None:
          index_col = self._index_col
      if nrows is None:
          nrows = self._nobs

      # Handle empty file or chunk. If reading incrementally raise
      # StopIteration. If reading the whole thing return an empty
      # data frame.
      if (self._nobs == 0) and nrows == 0:
          self._can_read_value_labels = True
          self._data_read = True
          data = DataFrame(columns=self._varlist)
          # Apply dtypes correctly
          for i, col in enumerate(data.columns):
              dt = self._dtyplist[i]
              if isinstance(dt, np.dtype):
                  # Fixed-width bytes ("S") columns are left as they are
                  if dt.char != "S":
                      data[col] = data[col].astype(dt)
          if columns is not None:
              data = self._do_select_columns(data, columns)
          return data

      # strLs are only loaded while the value labels have not been read yet
      if (self._format_version >= 117) and (not self._value_labels_read):
          self._can_read_value_labels = True
          self._read_strls()

      # Read data
      assert self._dtype is not None
      dtype = self._dtype
      # Never read past the last observation
      max_read_len = (self._nobs - self._lines_read) * dtype.itemsize
      read_len = nrows * dtype.itemsize
      read_len = min(read_len, max_read_len)
      if read_len <= 0:
          # Iterator has finished, should never be here unless
          # we are reading the file incrementally
          if convert_categoricals:
              self._read_value_labels()
          raise StopIteration
      offset = self._lines_read * dtype.itemsize
      self._path_or_buf.seek(self._data_location + offset)
      read_lines = min(nrows, self._nobs - self._lines_read)
      raw_data = np.frombuffer(
          self._path_or_buf.read(read_len), dtype=dtype, count=read_lines
      )

      self._lines_read += read_lines
      if self._lines_read == self._nobs:
          self._can_read_value_labels = True
          self._data_read = True
      # if necessary, swap the byte order to native here
      if self._byteorder != self._native_byteorder:
          raw_data = raw_data.byteswap().view(raw_data.dtype.newbyteorder())

      if convert_categoricals:
          self._read_value_labels()

      if len(raw_data) == 0:
          data = DataFrame(columns=self._varlist)
      else:
          data = DataFrame.from_records(raw_data)
          data.columns = Index(self._varlist)

      # If index is not specified, use actual row number rather than
      # restarting at 0 for each chunk.
      if index_col is None:
          data.index = RangeIndex(
              self._lines_read - read_lines, self._lines_read
          )  # set attr instead of set_index to avoid copy

      # Column selection also narrows self._typlist / _dtyplist / _fmtlist /
      # _lbllist, so the positional loops below line up with ``data``
      if columns is not None:
          data = self._do_select_columns(data, columns)

      # Decode strings
      for col, typ in zip(data, self._typlist):
          # Integer entries in _typlist are the fixed-width string columns
          if isinstance(typ, int):
              data[col] = data[col].apply(self._decode)

      data = self._insert_strls(data)

      # Convert columns (if needed) to match input type
      valid_dtypes = [i for i, dtyp in enumerate(self._dtyplist) if dtyp is not None]
      object_type = np.dtype(object)
      for idx in valid_dtypes:
          dtype = data.iloc[:, idx].dtype
          # NOTE(review): ``dtype`` is the column's current dtype, so the
          # astype call below casts the column to the dtype it already has.
          # Presumably ``self._dtyplist[idx]`` was the intended target --
          # confirm before changing, since that would alter results.
          if dtype not in (object_type, self._dtyplist[idx]):
              data.isetitem(idx, data.iloc[:, idx].astype(dtype))

      data = self._do_convert_missing(data, convert_missing)

      if convert_dates:
          for i, fmt in enumerate(self._fmtlist):
              if any(fmt.startswith(date_fmt) for date_fmt in _date_formats):
                  data.isetitem(
                      i, _stata_elapsed_date_to_datetime_vec(data.iloc[:, i], fmt)
                  )

      if convert_categoricals and self._format_version > 108:
          data = self._do_convert_categoricals(
              data, self._value_label_dict, self._lbllist, order_categoricals
          )

      if not preserve_dtypes:
          # Upcast float16/float32 to float64 and int8/int16/int32 to int64.
          # The frame is only rebuilt when at least one column changed.
          retyped_data = []
          convert = False
          for col in data:
              dtype = data[col].dtype
              if dtype in (np.dtype(np.float16), np.dtype(np.float32)):
                  dtype = np.dtype(np.float64)
                  convert = True
              elif dtype in (
                  np.dtype(np.int8),
                  np.dtype(np.int16),
                  np.dtype(np.int32),
              ):
                  dtype = np.dtype(np.int64)
                  convert = True
              retyped_data.append((col, data[col].astype(dtype)))
          if convert:
              data = DataFrame.from_dict(dict(retyped_data))

      if index_col is not None:
          data = data.set_index(data.pop(index_col))

      return data
  def _do_convert_missing(self, data: DataFrame, convert_missing: bool) -> DataFrame:
      """
      Replace out-of-range values with missing-value markers.

      A column is examined only when its entry in ``self._typlist`` is a key
      of ``self.VALID_RANGE``. Values outside that ``(min, max)`` range are
      treated as missing.

      Parameters
      ----------
      data : DataFrame
          Frame whose columns line up positionally with ``self._typlist``.
          Affected columns are replaced in place via ``isetitem``.
      convert_missing : bool
          If True, each missing value becomes a ``StataMissingValue`` in an
          object-dtype column. If False, missing values become NaN in a
          float column.

      Returns
      -------
      DataFrame
          The same ``data`` object that was passed in.
      """
      # Check for missing values, and replace if found
      replacements = {}
      for i in range(len(data.columns)):
          fmt = self._typlist[i]
          if fmt not in self.VALID_RANGE:
              continue

          fmt = cast(str, fmt)  # only strs in VALID_RANGE
          nmin, nmax = self.VALID_RANGE[fmt]
          series = data.iloc[:, i]

          # appreciably faster to do this with ndarray instead of Series
          svals = series._values
          missing = (svals < nmin) | (svals > nmax)

          if not missing.any():
              continue

          if convert_missing:  # Replacement follows Stata notation
              missing_loc = np.nonzero(np.asarray(missing))[0]
              # umissing_loc maps every missing entry to its index in umissing
              umissing, umissing_loc = np.unique(series[missing], return_inverse=True)
              replacement = Series(series, dtype=object)
              for j, um in enumerate(umissing):
                  missing_value = StataMissingValue(um)

                  # Positions holding this particular missing code
                  loc = missing_loc[umissing_loc == j]
                  replacement.iloc[loc] = missing_value
          else:  # All replacements are identical
              dtype = series.dtype
              # Anything that is not already float32/float64 becomes float64
              # so that it can hold NaN
              if dtype not in (np.float32, np.float64):
                  dtype = np.float64
              replacement = Series(series, dtype=dtype)
              if not replacement._values.flags["WRITEABLE"]:
                  # only relevant for ArrayManager; construction
                  # path for BlockManager ensures writeability
                  replacement = replacement.copy()
              # Note: operating on ._values is much faster than directly
              # TODO: can we fix that?
              replacement._values[missing] = np.nan
          replacements[i] = replacement
      if replacements:
          for idx, value in replacements.items():
              data.isetitem(idx, value)
      return data
  def _insert_strls(self, data: DataFrame) -> DataFrame:
      """
      Replace strL key columns with the strings stored in ``self.GSO``.

      Returns ``data`` untouched when no ``GSO`` lookup exists or it is
      empty. Otherwise every column whose ``self._typlist`` entry is ``"Q"``
      is overwritten, in place, with the looked-up strings.
      """
      gso = getattr(self, "GSO", None)
      if not gso:
          return data
      strl_positions = [
          position for position, typ in enumerate(self._typlist) if typ == "Q"
      ]
      for position in strl_positions:
          # Wrap v_o in a string to allow uint64 values as keys on 32bit OS
          resolved = [self.GSO[str(key)] for key in data.iloc[:, position]]
          data.isetitem(position, resolved)
      return data
  def _do_select_columns(self, data: DataFrame, columns: Sequence[str]) -> DataFrame:
      """
      Return ``data`` restricted to ``columns``, in the requested order.

      On the first call the per-column metadata lists (``_dtyplist``,
      ``_typlist``, ``_fmtlist``, ``_lbllist``) are narrowed to the selected
      columns and ``_column_selector_set`` is set, so later calls only slice
      the frame.

      Raises
      ------
      ValueError
          If ``columns`` contains duplicates or names not present in
          ``data``.
      """
      if self._column_selector_set:
          return data[columns]

      requested = set(columns)
      if len(requested) != len(columns):
          raise ValueError("columns contains duplicate entries")
      missing = requested.difference(data.columns)
      if missing:
          joined = ", ".join(list(missing))
          raise ValueError(
              f"The following columns were not found in the Stata data set: {joined}"
          )

      # Copy information for retained columns for later processing
      retained = []
      for name in columns:
          position = data.columns.get_loc(name)
          retained.append(
              (
                  self._dtyplist[position],
                  self._typlist[position],
                  self._fmtlist[position],
                  self._lbllist[position],
              )
          )
      self._dtyplist = [entry[0] for entry in retained]
      self._typlist = [entry[1] for entry in retained]
      self._fmtlist = [entry[2] for entry in retained]
      self._lbllist = [entry[3] for entry in retained]
      self._column_selector_set = True
      return data[columns]
  def _do_convert_categoricals(
      self,
      data: DataFrame,
      value_label_dict: dict[str, dict[float, str]],
      lbllist: Sequence[str],
      order_categoricals: bool,
  ) -> DataFrame:
      """
      Converts categorical columns to Categorical type.

      Parameters
      ----------
      data : DataFrame
          Frame whose columns are paired positionally with ``lbllist``.
      value_label_dict : dict
          Maps a label-set name to its ``{value: label text}`` mapping. If
          empty, ``data`` is returned unchanged.
      lbllist : sequence of str
          Label-set name for each column; columns whose name is not a key of
          ``value_label_dict`` are kept as they are.
      order_categoricals : bool
          Passed to ``Categorical`` as ``ordered``.

      Returns
      -------
      DataFrame
          A new frame built from the converted and untouched columns.

      Raises
      ------
      ValueError
          If renaming the categories to their labels fails, which the message
          reports as non-unique value labels.
      """
      if not value_label_dict:
          return data
      cat_converted_data = []
      for col, label in zip(data, lbllist):
          if label in value_label_dict:
              # Explicit call with ordered=True
              vl = value_label_dict[label]
              keys = np.array(list(vl.keys()))
              column = data[col]
              key_matches = column.isin(keys)
              if self._using_iterator and key_matches.all():
                  initial_categories: np.ndarray | None = keys
                  # If all categories are in the keys and we are iterating,
                  # use the same keys for all chunks. If some are missing
                  # value labels, then we will fall back to the categories
                  # varying across chunks.
              else:
                  if self._using_iterator:
                      # warn is using an iterator
                      warnings.warn(
                          categorical_conversion_warning,
                          CategoricalConversionWarning,
                          stacklevel=find_stack_level(),
                      )
                  initial_categories = None
              cat_data = Categorical(
                  column, categories=initial_categories, ordered=order_categoricals
              )
              if initial_categories is None:
                  # If None here, then we need to match the cats in the Categorical
                  categories = []
                  for category in cat_data.categories:
                      # Values without a label keep the raw value as category
                      if category in vl:
                          categories.append(vl[category])
                      else:
                          categories.append(category)
              else:
                  # If all cats are matched, we can use the values
                  categories = list(vl.values())
              try:
                  # Try to catch duplicate categories
                  # TODO: if we get a non-copying rename_categories, use that
                  cat_data = cat_data.rename_categories(categories)
              except ValueError as err:
                  # Collect the labels that occur more than once for the message
                  vc = Series(categories, copy=False).value_counts()
                  repeated_cats = list(vc.index[vc > 1])
                  repeats = "-" * 80 + "\n" + "\n".join(repeated_cats)
                  # GH 25772
                  msg = f"""
Value labels for column {col} are not unique. These cannot be converted to
pandas categoricals.
Either read the file with `convert_categoricals` set to False or use the
low level interface in `StataReader` to separately read the values and the
value_labels.
The repeated labels are:
{repeats}
"""
                  raise ValueError(msg) from err
              # TODO: is the next line needed above in the data(...) method?
              cat_series = Series(cat_data, index=data.index, copy=False)
              cat_converted_data.append((col, cat_series))
          else:
              cat_converted_data.append((col, data[col]))
      data = DataFrame(dict(cat_converted_data), copy=False)
      return data
  @property
  def data_label(self) -> str:
      """
      Return data label of Stata file.

      The reader is opened first if necessary.

      Examples
      --------
      >>> df = pd.DataFrame([(1,)], columns=["variable"])
      >>> time_stamp = pd.Timestamp(2000, 2, 29, 14, 21)
      >>> data_label = "This is a data file."
      >>> path = "/My_path/filename.dta"
      >>> df.to_stata(path, time_stamp=time_stamp,    # doctest: +SKIP
      ...             data_label=data_label,  # doctest: +SKIP
      ...             version=None)  # doctest: +SKIP
      >>> with pd.io.stata.StataReader(path) as reader:  # doctest: +SKIP
      ...     print(reader.data_label)  # doctest: +SKIP
      This is a data file.
      """
      self._ensure_open()
      label = self._data_label
      return label
  @property
  def time_stamp(self) -> str:
      """
      Return time stamp of Stata file.

      The reader is opened first if necessary.
      """
      self._ensure_open()
      stamp = self._time_stamp
      return stamp
  def variable_labels(self) -> dict[str, str]:
      """
      Return a dict associating each variable name with corresponding label.

      Names and labels are paired by position.

      Returns
      -------
      dict

      Examples
      --------
      >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=["col_1", "col_2"])
      >>> time_stamp = pd.Timestamp(2000, 2, 29, 14, 21)
      >>> path = "/My_path/filename.dta"
      >>> variable_labels = {"col_1": "This is an example"}
      >>> df.to_stata(path, time_stamp=time_stamp,  # doctest: +SKIP
      ...             variable_labels=variable_labels, version=None)  # doctest: +SKIP
      >>> with pd.io.stata.StataReader(path) as reader:  # doctest: +SKIP
      ...     print(reader.variable_labels())  # doctest: +SKIP
      {'index': '', 'col_1': 'This is an example', 'col_2': ''}
      >>> pd.read_stata(path)  # doctest: +SKIP
          index col_1 col_2
      0       0     1     2
      1       1     3     4
      """
      self._ensure_open()
      pairs = zip(self._varlist, self._variable_labels)
      return {name: label for name, label in pairs}
  def value_labels(self) -> dict[str, dict[float, str]]:
      """
      Return a nested dict associating each variable name to its value and label.

      The labels are read from the file on first use and reused afterwards.

      Returns
      -------
      dict

      Examples
      --------
      >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=["col_1", "col_2"])
      >>> time_stamp = pd.Timestamp(2000, 2, 29, 14, 21)
      >>> path = "/My_path/filename.dta"
      >>> value_labels = {"col_1": {3: "x"}}
      >>> df.to_stata(path, time_stamp=time_stamp,  # doctest: +SKIP
      ...             value_labels=value_labels, version=None)  # doctest: +SKIP
      >>> with pd.io.stata.StataReader(path) as reader:  # doctest: +SKIP
      ...     print(reader.value_labels())  # doctest: +SKIP
      {'col_1': {3: 'x'}}
      >>> pd.read_stata(path)  # doctest: +SKIP
          index col_1 col_2
      0       0     1     2
      1       1     x     4
      """
      if self._value_labels_read:
          return self._value_label_dict
      self._read_value_labels()
      return self._value_label_dict
  @Appender(_read_stata_doc)
  def read_stata(
      filepath_or_buffer: FilePath | ReadBuffer[bytes],
      *,
      convert_dates: bool = True,
      convert_categoricals: bool = True,
      index_col: str | None = None,
      convert_missing: bool = False,
      preserve_dtypes: bool = True,
      columns: Sequence[str] | None = None,
      order_categoricals: bool = True,
      chunksize: int | None = None,
      iterator: bool = False,
      compression: CompressionOptions = "infer",
      storage_options: StorageOptions | None = None,
  ) -> DataFrame | StataReader:
      # No docstring is written here: the decorator above supplies it.
      reader = StataReader(
          filepath_or_buffer,
          convert_dates=convert_dates,
          convert_categoricals=convert_categoricals,
          index_col=index_col,
          convert_missing=convert_missing,
          preserve_dtypes=preserve_dtypes,
          columns=columns,
          order_categoricals=order_categoricals,
          chunksize=chunksize,
          storage_options=storage_options,
          compression=compression,
      )

      # Incremental use: hand the still-open reader back to the caller
      wants_reader = iterator or chunksize
      if not wants_reader:
          # One-shot use: read everything inside the reader's context manager
          with reader:
              return reader.read()
      return reader
  def _set_endianness(endianness: str) -> str:
      """
      Normalize a byte-order specifier to ``"<"`` or ``">"``.

      ``"<"`` and ``"little"`` map to ``"<"``; ``">"`` and ``"big"`` map to
      ``">"``. Matching is case-insensitive.

      Raises
      ------
      ValueError
          If ``endianness`` is none of the accepted spellings.
      """
      normalized = endianness.lower()
      if normalized in ("<", "little"):
          return "<"
      if normalized in (">", "big"):
          return ">"
      raise ValueError(f"Endianness {endianness} not understood")  # pragma : no cover
  def _pad_bytes(name: AnyStr, length: int) -> AnyStr:
      """
      Right-pad ``name`` with null characters up to ``length`` items.

      Works for both ``bytes`` and ``str`` input and returns the same type.
      Input that is already ``length`` or longer is returned unchanged.
      """
      filler = b"\x00" if isinstance(name, bytes) else "\x00"
      return name + filler * (length - len(name))
  def _convert_datetime_to_stata_type(fmt: str) -> np.dtype:
      """
      Return the numpy dtype used to store a Stata date format.

      Accepts tc, td, tw, tm, tq, th and ty, each with or without a leading
      ``%``. All of them map to float64.

      Raises
      ------
      NotImplementedError
          If ``fmt`` is not one of the accepted formats.
      """
      units = ("tc", "td", "tw", "tm", "tq", "th", "ty")
      accepted = [spec for unit in units for spec in (unit, "%" + unit)]
      if fmt not in accepted:
          raise NotImplementedError(f"Format {fmt} not implemented")
      return np.dtype(np.float64)  # Stata expects doubles for SIFs
  def _maybe_convert_to_int_keys(convert_dates: dict, varlist: list[Hashable]) -> dict:
      """
      Re-key a ``convert_dates`` mapping by integer column position.

      Every format is given a leading ``%`` if it lacks one. Keys found in
      ``varlist`` are replaced by their position in it; any other key must
      already be an integer and is kept as is.

      The input mapping is left untouched: normalized formats are written
      only to the returned dict.

      Parameters
      ----------
      convert_dates : dict
          Maps a column name or integer position to a Stata date format.
      varlist : list
          Column names, in file order.

      Returns
      -------
      dict
          Maps integer position to ``%``-prefixed format.

      Raises
      ------
      ValueError
          If a key is neither in ``varlist`` nor an integer.
      """
      new_dict = {}
      for key, fmt in convert_dates.items():
          if not fmt.startswith("%"):  # make sure proper fmts
              fmt = "%" + fmt
          if key in varlist:
              new_dict[varlist.index(key)] = fmt
          elif isinstance(key, int):
              new_dict[key] = fmt
          else:
              raise ValueError("convert_dates key must be a column or an integer")
      return new_dict
  def _dtype_to_stata_type(dtype: np.dtype, column: Series) -> int:
      """
      Convert dtype types to stata types. Returns the byte of the given ordinal.
      See TYPE_MAP and comments for an explanation. This is also explained in
      the dta spec.
      1 - 244 are strings of this length

                           Pandas    Stata
      251 - for int8      byte
      252 - for int16     int
      253 - for int32     long
      254 - for float32   float
      255 - for double    double

      Object columns return the length of their longest string, with a
      minimum of 1. Any other dtype raises NotImplementedError.

      If there are dates to convert, then dtype will already have the correct
      type inserted.
      """
      # TODO: expand to handle datetime to integer conversion
      scalar_type = dtype.type
      if scalar_type is np.object_:
          # try to coerce it to the biggest string
          # not memory efficient, what else could we do?
          longest = max_len_string_array(ensure_object(column._values))
          return max(longest, 1)

      numeric_codes = {
          np.float64: 255,
          np.float32: 254,
          np.int32: 253,
          np.int16: 252,
          np.int8: 251,
      }
      code = numeric_codes.get(scalar_type)
      if code is None:  # pragma : no cover
          raise NotImplementedError(f"Data type {dtype} not supported.")
      return code
  def _dtype_to_default_stata_fmt(
      dtype, column: Series, dta_version: int = 114, force_strl: bool = False
  ) -> str:
      """
      Map numpy dtype to stata's default format for this type. Not terribly
      important since users can change this in Stata. Semantics are

      object  -> "%DDs" where DD is the length of the longest string
                 (at least 1). Strings longer than the limit (244 before
                 dta version 117, 2045 from 117 on) give "%9s" for
                 version 117+ and raise ValueError otherwise
      float64 -> "%10.0g"
      float32 -> "%9.0g"
      int32   -> "%12.0g"
      int16   -> "%8.0g"
      int8    -> "%8.0g"
      strl    -> "%9s" (whenever ``force_strl`` is True)

      Any other dtype raises NotImplementedError.
      """
      # TODO: Refactor to combine type with format
      # TODO: expand this to handle a default datetime format?
      max_str_len = 244 if dta_version < 117 else 2045
      if force_strl:
          return "%9s"
      if dtype.type is np.object_:
          itemsize = max_len_string_array(ensure_object(column._values))
          if itemsize <= max_str_len:
              return f"%{max(itemsize, 1)}s"
          if dta_version >= 117:
              return "%9s"
          raise ValueError(excessive_string_length_error.format(column.name))
      if dtype == np.float64:
          return "%10.0g"
      if dtype == np.float32:
          return "%9.0g"
      if dtype == np.int32:
          return "%12.0g"
      if dtype in (np.int8, np.int16):
          return "%8.0g"
      raise NotImplementedError(  # pragma : no cover
          f"Data type {dtype} not supported."
      )
  1969. @doc(
  1970. storage_options=_shared_docs["storage_options"],
  1971. compression_options=_shared_docs["compression_options"] % "fname",
  1972. )
  1973. class StataWriter(StataParser):
  1974. """
  1975. A class for writing Stata binary dta files
  1976. Parameters
  1977. ----------
  1978. fname : path (string), buffer or path object
  1979. string, path object (pathlib.Path or py._path.local.LocalPath) or
  1980. object implementing a binary write() functions. If using a buffer
  1981. then the buffer will not be automatically closed after the file
  1982. is written.
  1983. data : DataFrame
  1984. Input to save
  1985. convert_dates : dict
  1986. Dictionary mapping columns containing datetime types to stata internal
  1987. format to use when writing the dates. Options are 'tc', 'td', 'tm',
  1988. 'tw', 'th', 'tq', 'ty'. Column can be either an integer or a name.
  1989. Datetime columns that do not have a conversion type specified will be
  1990. converted to 'tc'. Raises NotImplementedError if a datetime column has
  1991. timezone information
  1992. write_index : bool
  1993. Write the index to Stata dataset.
  1994. byteorder : str
  1995. Can be ">", "<", "little", or "big". default is `sys.byteorder`
  1996. time_stamp : datetime
  1997. A datetime to use as file creation date. Default is the current time
  1998. data_label : str
  1999. A label for the data set. Must be 80 characters or smaller.
  2000. variable_labels : dict
  2001. Dictionary containing columns as keys and variable labels as values.
  2002. Each label must be 80 characters or smaller.
  2003. {compression_options}
  2004. .. versionchanged:: 1.4.0 Zstandard support.
  2005. {storage_options}
  2006. value_labels : dict of dicts
  2007. Dictionary containing columns as keys and dictionaries of column value
  2008. to labels as values. The combined length of all labels for a single
  2009. variable must be 32,000 characters or smaller.
  2010. .. versionadded:: 1.4.0
  2011. Returns
  2012. -------
  2013. writer : StataWriter instance
  2014. The StataWriter instance has a write_file method, which will
  2015. write the file to the given `fname`.
  2016. Raises
  2017. ------
  2018. NotImplementedError
  2019. * If datetimes contain timezone information
  2020. ValueError
  2021. * Columns listed in convert_dates are neither datetime64[ns]
  2022. or datetime
  2023. * Column dtype is not representable in Stata
  2024. * Column listed in convert_dates is not in DataFrame
  2025. * Categorical label contains more than 32,000 characters
  2026. Examples
  2027. --------
  2028. >>> data = pd.DataFrame([[1.0, 1]], columns=['a', 'b'])
  2029. >>> writer = StataWriter('./data_file.dta', data)
  2030. >>> writer.write_file()
  2031. Directly write a zip file
  2032. >>> compression = {{"method": "zip", "archive_name": "data_file.dta"}}
  2033. >>> writer = StataWriter('./data_file.zip', data, compression=compression)
  2034. >>> writer.write_file()
  2035. Save a DataFrame with dates
  2036. >>> from datetime import datetime
  2037. >>> data = pd.DataFrame([[datetime(2000,1,1)]], columns=['date'])
  2038. >>> writer = StataWriter('./date_data_file.dta', data, {{'date' : 'tw'}})
  2039. >>> writer.write_file()
  2040. """
  2041. _max_string_length = 244
  2042. _encoding: Literal["latin-1", "utf-8"] = "latin-1"
  def __init__(
      self,
      fname: FilePath | WriteBuffer[bytes],
      data: DataFrame,
      convert_dates: dict[Hashable, str] | None = None,
      write_index: bool = True,
      byteorder: str | None = None,
      time_stamp: datetime | None = None,
      data_label: str | None = None,
      variable_labels: dict[Hashable, str] | None = None,
      compression: CompressionOptions = "infer",
      storage_options: StorageOptions | None = None,
      *,
      value_labels: dict[Hashable, dict[float, str]] | None = None,
  ) -> None:
      # Store the writer options, then let _prepare_pandas process ``data``.
      # The parameters are described in the class docstring.
      super().__init__()
      self.data = data
      # A missing convert_dates mapping is treated as empty
      self._convert_dates = {} if convert_dates is None else convert_dates
      self._write_index = write_index
      self._time_stamp = time_stamp
      self._data_label = data_label
      self._variable_labels = variable_labels
      # User-supplied labels are kept apart from the StataValueLabel list below
      self._non_cat_value_labels = value_labels
      self._value_labels: list[StataValueLabel] = []
      self._has_value_labels = np.array([], dtype=bool)
      self._compression = compression
      self._output_file: IO[bytes] | None = None
      self._converted_names: dict[Hashable, str] = {}
      # attach nobs, nvars, data, varlist, typlist
      # NOTE(review): runs after the attributes above are set; presumably it
      # reads some of them -- confirm before reordering.
      self._prepare_pandas(data)
      self.storage_options = storage_options

      # Default to the byte order of the running interpreter
      if byteorder is None:
          byteorder = sys.byteorder
      self._byteorder = _set_endianness(byteorder)
      self._fname = fname
      # Keys match the integer type codes documented in _dtype_to_stata_type
      self.type_converters = {253: np.int32, 252: np.int16, 251: np.int8}
  def _write(self, to_write: str) -> None:
      """
      Encode ``to_write`` with ``self._encoding`` and write it to the output
      handle.
      """
      sink = self.handles.handle
      sink.write(to_write.encode(self._encoding))
  def _write_bytes(self, value: bytes) -> None:
      """
      Write ``value`` to the output handle unchanged.
      """
      sink = self.handles.handle
      sink.write(value)
  2089. def _prepare_non_cat_value_labels(
  2090. self, data: DataFrame
  2091. ) -> list[StataNonCatValueLabel]:
  2092. """
  2093. Check for value labels provided for non-categorical columns. Value
  2094. labels
  2095. """
  2096. non_cat_value_labels: list[StataNonCatValueLabel] = []
  2097. if self._non_cat_value_labels is None:
  2098. return non_cat_value_labels
  2099. for labname, labels in self._non_cat_value_labels.items():
  2100. if labname in self._converted_names:
  2101. colname = self._converted_names[labname]
  2102. elif labname in data.columns:
  2103. colname = str(labname)
  2104. else:
  2105. raise KeyError(
  2106. f"Can't create value labels for {labname}, it wasn't "
  2107. "found in the dataset."
  2108. )
  2109. if not is_numeric_dtype(data[colname].dtype):
  2110. # Labels should not be passed explicitly for categorical
  2111. # columns that will be converted to int
  2112. raise ValueError(
  2113. f"Can't create value labels for {labname}, value labels "
  2114. "can only be applied to numeric columns."
  2115. )
  2116. svl = StataNonCatValueLabel(colname, labels, self._encoding)
  2117. non_cat_value_labels.append(svl)
  2118. return non_cat_value_labels
  2119. def _prepare_categoricals(self, data: DataFrame) -> DataFrame:
  2120. """
  2121. Check for categorical columns, retain categorical information for
  2122. Stata file and convert categorical data to int
  2123. """
  2124. is_cat = [isinstance(dtype, CategoricalDtype) for dtype in data.dtypes]
  2125. if not any(is_cat):
  2126. return data
  2127. self._has_value_labels |= np.array(is_cat)
  2128. get_base_missing_value = StataMissingValue.get_base_missing_value
  2129. data_formatted = []
  2130. for col, col_is_cat in zip(data, is_cat):
  2131. if col_is_cat:
  2132. svl = StataValueLabel(data[col], encoding=self._encoding)
  2133. self._value_labels.append(svl)
  2134. dtype = data[col].cat.codes.dtype
  2135. if dtype == np.int64:
  2136. raise ValueError(
  2137. "It is not possible to export "
  2138. "int64-based categorical data to Stata."
  2139. )
  2140. values = data[col].cat.codes._values.copy()
  2141. # Upcast if needed so that correct missing values can be set
  2142. if values.max() >= get_base_missing_value(dtype):
  2143. if dtype == np.int8:
  2144. dtype = np.dtype(np.int16)
  2145. elif dtype == np.int16:
  2146. dtype = np.dtype(np.int32)
  2147. else:
  2148. dtype = np.dtype(np.float64)
  2149. values = np.array(values, dtype=dtype)
  2150. # Replace missing values with Stata missing value for type
  2151. values[values == -1] = get_base_missing_value(dtype)
  2152. data_formatted.append((col, values))
  2153. else:
  2154. data_formatted.append((col, data[col]))
  2155. return DataFrame.from_dict(dict(data_formatted))
  2156. def _replace_nans(self, data: DataFrame) -> DataFrame:
  2157. # return data
  2158. """
  2159. Checks floating point data columns for nans, and replaces these with
  2160. the generic Stata for missing value (.)
  2161. """
  2162. for c in data:
  2163. dtype = data[c].dtype
  2164. if dtype in (np.float32, np.float64):
  2165. if dtype == np.float32:
  2166. replacement = self.MISSING_VALUES["f"]
  2167. else:
  2168. replacement = self.MISSING_VALUES["d"]
  2169. data[c] = data[c].fillna(replacement)
  2170. return data
  2171. def _update_strl_names(self) -> None:
  2172. """No-op, forward compatibility"""
  2173. def _validate_variable_name(self, name: str) -> str:
  2174. """
  2175. Validate variable names for Stata export.
  2176. Parameters
  2177. ----------
  2178. name : str
  2179. Variable name
  2180. Returns
  2181. -------
  2182. str
  2183. The validated name with invalid characters replaced with
  2184. underscores.
  2185. Notes
  2186. -----
  2187. Stata 114 and 117 support ascii characters in a-z, A-Z, 0-9
  2188. and _.
  2189. """
  2190. for c in name:
  2191. if (
  2192. (c < "A" or c > "Z")
  2193. and (c < "a" or c > "z")
  2194. and (c < "0" or c > "9")
  2195. and c != "_"
  2196. ):
  2197. name = name.replace(c, "_")
  2198. return name
  2199. def _check_column_names(self, data: DataFrame) -> DataFrame:
  2200. """
  2201. Checks column names to ensure that they are valid Stata column names.
  2202. This includes checks for:
  2203. * Non-string names
  2204. * Stata keywords
  2205. * Variables that start with numbers
  2206. * Variables with names that are too long
  2207. When an illegal variable name is detected, it is converted, and if
  2208. dates are exported, the variable name is propagated to the date
  2209. conversion dictionary
  2210. """
  2211. converted_names: dict[Hashable, str] = {}
  2212. columns = list(data.columns)
  2213. original_columns = columns[:]
  2214. duplicate_var_id = 0
  2215. for j, name in enumerate(columns):
  2216. orig_name = name
  2217. if not isinstance(name, str):
  2218. name = str(name)
  2219. name = self._validate_variable_name(name)
  2220. # Variable name must not be a reserved word
  2221. if name in self.RESERVED_WORDS:
  2222. name = "_" + name
  2223. # Variable name may not start with a number
  2224. if "0" <= name[0] <= "9":
  2225. name = "_" + name
  2226. name = name[: min(len(name), 32)]
  2227. if not name == orig_name:
  2228. # check for duplicates
  2229. while columns.count(name) > 0:
  2230. # prepend ascending number to avoid duplicates
  2231. name = "_" + str(duplicate_var_id) + name
  2232. name = name[: min(len(name), 32)]
  2233. duplicate_var_id += 1
  2234. converted_names[orig_name] = name
  2235. columns[j] = name
  2236. data.columns = Index(columns)
  2237. # Check date conversion, and fix key if needed
  2238. if self._convert_dates:
  2239. for c, o in zip(columns, original_columns):
  2240. if c != o:
  2241. self._convert_dates[c] = self._convert_dates[o]
  2242. del self._convert_dates[o]
  2243. if converted_names:
  2244. conversion_warning = []
  2245. for orig_name, name in converted_names.items():
  2246. msg = f"{orig_name} -> {name}"
  2247. conversion_warning.append(msg)
  2248. ws = invalid_name_doc.format("\n ".join(conversion_warning))
  2249. warnings.warn(
  2250. ws,
  2251. InvalidColumnName,
  2252. stacklevel=find_stack_level(),
  2253. )
  2254. self._converted_names = converted_names
  2255. self._update_strl_names()
  2256. return data
  2257. def _set_formats_and_types(self, dtypes: Series) -> None:
  2258. self.fmtlist: list[str] = []
  2259. self.typlist: list[int] = []
  2260. for col, dtype in dtypes.items():
  2261. self.fmtlist.append(_dtype_to_default_stata_fmt(dtype, self.data[col]))
  2262. self.typlist.append(_dtype_to_stata_type(dtype, self.data[col]))
    def _prepare_pandas(self, data: DataFrame) -> None:
        """
        Turn ``data`` into the frame that will be written and derive the
        per-column metadata from it.

        Works on a copy of ``data``. Sets ``self.data``, ``self.nobs``,
        ``self.nvar``, ``self.varlist``, ``self._has_value_labels`` and, via
        ``_set_formats_and_types``, ``self.fmtlist`` and ``self.typlist``;
        extends ``self._value_labels`` and rebinds ``self._convert_dates`` to
        the result of ``_maybe_convert_to_int_keys``.
        """
        # NOTE: we might need a different API / class for pandas objects so
        # we can set different semantics - handle this with a PR to pandas.io

        data = data.copy()

        if self._write_index:
            # Export the index as ordinary column(s)
            temp = data.reset_index()
            if isinstance(temp, DataFrame):
                data = temp

        # Ensure column names are strings
        data = self._check_column_names(data)

        # Check columns for compatibility with stata, upcast if necessary
        # Raise if outside the supported range
        data = _cast_to_stata_types(data)

        # Replace NaNs with Stata missing values
        data = self._replace_nans(data)

        # Set all columns to initially unlabelled
        self._has_value_labels = np.repeat(False, data.shape[1])

        # Create value labels for non-categorical data
        non_cat_value_labels = self._prepare_non_cat_value_labels(data)

        non_cat_columns = [svl.labname for svl in non_cat_value_labels]
        has_non_cat_val_labels = data.columns.isin(non_cat_columns)
        self._has_value_labels |= has_non_cat_val_labels
        self._value_labels.extend(non_cat_value_labels)

        # Convert categoricals to int data, and strip labels
        data = self._prepare_categoricals(data)

        self.nobs, self.nvar = data.shape
        self.data = data
        self.varlist = data.columns.tolist()

        dtypes = data.dtypes

        # Ensure all date columns are converted
        for col in data:
            if col in self._convert_dates:
                continue
            # datetime64 columns without an explicit format default to "tc"
            if lib.is_np_dtype(data[col].dtype, "M"):
                self._convert_dates[col] = "tc"

        self._convert_dates = _maybe_convert_to_int_keys(
            self._convert_dates, self.varlist
        )
        # The keys are used as positions from here on: override the dtype
        # recorded for each date column with the dtype of its converted form.
        for key in self._convert_dates:
            new_type = _convert_datetime_to_stata_type(self._convert_dates[key])
            dtypes.iloc[key] = np.dtype(new_type)

        # Verify object arrays are strings and encode to bytes
        self._encode_strings()

        self._set_formats_and_types(dtypes)

        # set the given format for the datetime cols
        if self._convert_dates is not None:
            for key in self._convert_dates:
                if isinstance(key, int):
                    self.fmtlist[key] = self._convert_dates[key]
  2312. def _encode_strings(self) -> None:
  2313. """
  2314. Encode strings in dta-specific encoding
  2315. Do not encode columns marked for date conversion or for strL
  2316. conversion. The strL converter independently handles conversion and
  2317. also accepts empty string arrays.
  2318. """
  2319. convert_dates = self._convert_dates
  2320. # _convert_strl is not available in dta 114
  2321. convert_strl = getattr(self, "_convert_strl", [])
  2322. for i, col in enumerate(self.data):
  2323. # Skip columns marked for date conversion or strl conversion
  2324. if i in convert_dates or col in convert_strl:
  2325. continue
  2326. column = self.data[col]
  2327. dtype = column.dtype
  2328. # TODO could also handle string dtype here specifically
  2329. if dtype.type is np.object_:
  2330. inferred_dtype = infer_dtype(column, skipna=True)
  2331. if not ((inferred_dtype == "string") or len(column) == 0):
  2332. col = column.name
  2333. raise ValueError(
  2334. f"""\
  2335. Column `{col}` cannot be exported.\n\nOnly string-like object arrays
  2336. containing all strings or a mix of strings and None can be exported.
  2337. Object arrays containing only null values are prohibited. Other object
  2338. types cannot be exported and must first be converted to one of the
  2339. supported types."""
  2340. )
  2341. encoded = self.data[col].str.encode(self._encoding)
  2342. # If larger than _max_string_length do nothing
  2343. if (
  2344. max_len_string_array(ensure_object(encoded._values))
  2345. <= self._max_string_length
  2346. ):
  2347. self.data[col] = encoded
    def write_file(self) -> None:
        """
        Export DataFrame object to Stata dta format.

        Opens the destination through ``get_handle`` and writes every section
        in order. If any step raises, the handles are closed, a partially
        written file is removed when the destination is a path to an existing
        file, and the exception is re-raised.

        Examples
        --------
        >>> df = pd.DataFrame({"fully_labelled": [1, 2, 3, 3, 1],
        ...                    "partially_labelled": [1.0, 2.0, np.nan, 9.0, np.nan],
        ...                    "Y": [7, 7, 9, 8, 10],
        ...                    "Z": pd.Categorical(["j", "k", "l", "k", "j"]),
        ...                    })
        >>> path = "/My_path/filename.dta"
        >>> labels = {"fully_labelled": {1: "one", 2: "two", 3: "three"},
        ...           "partially_labelled": {1.0: "one", 2.0: "two"},
        ...           }
        >>> writer = pd.io.stata.StataWriter(path,
        ...                                  df,
        ...                                  value_labels=labels)  # doctest: +SKIP
        >>> writer.write_file()  # doctest: +SKIP
        >>> df = pd.read_stata(path)  # doctest: +SKIP
        >>> df  # doctest: +SKIP
            index fully_labelled  partially_labelled   Y  Z
        0       0            one                 one   7  j
        1       1            two                 two   7  k
        2       2          three                 NaN   9  l
        3       3          three                 9.0   8  k
        4       4            one                 NaN  10  j
        """
        with get_handle(
            self._fname,
            "wb",
            compression=self._compression,
            is_text=False,
            storage_options=self.storage_options,
        ) as self.handles:
            if self.handles.compression["method"] is not None:
                # ZipFile creates a file (with the same name) for each write call.
                # Write it first into a buffer and then write the buffer to the ZipFile.
                self._output_file, self.handles.handle = self.handles.handle, BytesIO()
                # Register the buffer so it is tracked with the other handles
                self.handles.created_handles.append(self.handles.handle)

            try:
                self._write_header(
                    data_label=self._data_label, time_stamp=self._time_stamp
                )
                self._write_map()
                self._write_variable_types()
                self._write_varnames()
                self._write_sortlist()
                self._write_formats()
                self._write_value_label_names()
                self._write_variable_labels()
                self._write_expansion_fields()
                self._write_characteristics()
                records = self._prepare_data()
                self._write_data(records)
                self._write_strls()
                self._write_value_labels()
                self._write_file_close_tag()
                # Second call to the map hook, after all sections are written
                self._write_map()
                # Flush the temporary buffer (if any) to the real output
                self._close()
            except Exception as exc:
                self.handles.close()
                # Only a path destination can be cleaned up; caller-supplied
                # buffers are left untouched.
                if isinstance(self._fname, (str, os.PathLike)) and os.path.isfile(
                    self._fname
                ):
                    try:
                        os.unlink(self._fname)
                    except OSError:
                        warnings.warn(
                            f"This save was not successful but {self._fname} could not "
                            "be deleted. This file is not valid.",
                            ResourceWarning,
                            stacklevel=find_stack_level(),
                        )
                raise exc
  2422. def _close(self) -> None:
  2423. """
  2424. Close the file if it was created by the writer.
  2425. If a buffer or file-like object was passed in, for example a GzipFile,
  2426. then leave this file open for the caller to close.
  2427. """
  2428. # write compression
  2429. if self._output_file is not None:
  2430. assert isinstance(self.handles.handle, BytesIO)
  2431. bio, self.handles.handle = self.handles.handle, self._output_file
  2432. self.handles.handle.write(bio.getvalue())
  2433. def _write_map(self) -> None:
  2434. """No-op, future compatibility"""
  2435. def _write_file_close_tag(self) -> None:
  2436. """No-op, future compatibility"""
  2437. def _write_characteristics(self) -> None:
  2438. """No-op, future compatibility"""
  2439. def _write_strls(self) -> None:
  2440. """No-op, future compatibility"""
  2441. def _write_expansion_fields(self) -> None:
  2442. """Write 5 zeros for expansion fields"""
  2443. self._write(_pad_bytes("", 5))
  2444. def _write_value_labels(self) -> None:
  2445. for vl in self._value_labels:
  2446. self._write_bytes(vl.generate_value_label(self._byteorder))
  2447. def _write_header(
  2448. self,
  2449. data_label: str | None = None,
  2450. time_stamp: datetime | None = None,
  2451. ) -> None:
  2452. byteorder = self._byteorder
  2453. # ds_format - just use 114
  2454. self._write_bytes(struct.pack("b", 114))
  2455. # byteorder
  2456. self._write(byteorder == ">" and "\x01" or "\x02")
  2457. # filetype
  2458. self._write("\x01")
  2459. # unused
  2460. self._write("\x00")
  2461. # number of vars, 2 bytes
  2462. self._write_bytes(struct.pack(byteorder + "h", self.nvar)[:2])
  2463. # number of obs, 4 bytes
  2464. self._write_bytes(struct.pack(byteorder + "i", self.nobs)[:4])
  2465. # data label 81 bytes, char, null terminated
  2466. if data_label is None:
  2467. self._write_bytes(self._null_terminate_bytes(_pad_bytes("", 80)))
  2468. else:
  2469. self._write_bytes(
  2470. self._null_terminate_bytes(_pad_bytes(data_label[:80], 80))
  2471. )
  2472. # time stamp, 18 bytes, char, null terminated
  2473. # format dd Mon yyyy hh:mm
  2474. if time_stamp is None:
  2475. time_stamp = datetime.now()
  2476. elif not isinstance(time_stamp, datetime):
  2477. raise ValueError("time_stamp should be datetime type")
  2478. # GH #13856
  2479. # Avoid locale-specific month conversion
  2480. months = [
  2481. "Jan",
  2482. "Feb",
  2483. "Mar",
  2484. "Apr",
  2485. "May",
  2486. "Jun",
  2487. "Jul",
  2488. "Aug",
  2489. "Sep",
  2490. "Oct",
  2491. "Nov",
  2492. "Dec",
  2493. ]
  2494. month_lookup = {i + 1: month for i, month in enumerate(months)}
  2495. ts = (
  2496. time_stamp.strftime("%d ")
  2497. + month_lookup[time_stamp.month]
  2498. + time_stamp.strftime(" %Y %H:%M")
  2499. )
  2500. self._write_bytes(self._null_terminate_bytes(ts))
  2501. def _write_variable_types(self) -> None:
  2502. for typ in self.typlist:
  2503. self._write_bytes(struct.pack("B", typ))
  2504. def _write_varnames(self) -> None:
  2505. # varlist names are checked by _check_column_names
  2506. # varlist, requires null terminated
  2507. for name in self.varlist:
  2508. name = self._null_terminate_str(name)
  2509. name = _pad_bytes(name[:32], 33)
  2510. self._write(name)
  2511. def _write_sortlist(self) -> None:
  2512. # srtlist, 2*(nvar+1), int array, encoded by byteorder
  2513. srtlist = _pad_bytes("", 2 * (self.nvar + 1))
  2514. self._write(srtlist)
  2515. def _write_formats(self) -> None:
  2516. # fmtlist, 49*nvar, char array
  2517. for fmt in self.fmtlist:
  2518. self._write(_pad_bytes(fmt, 49))
  2519. def _write_value_label_names(self) -> None:
  2520. # lbllist, 33*nvar, char array
  2521. for i in range(self.nvar):
  2522. # Use variable name when categorical
  2523. if self._has_value_labels[i]:
  2524. name = self.varlist[i]
  2525. name = self._null_terminate_str(name)
  2526. name = _pad_bytes(name[:32], 33)
  2527. self._write(name)
  2528. else: # Default is empty label
  2529. self._write(_pad_bytes("", 33))
  2530. def _write_variable_labels(self) -> None:
  2531. # Missing labels are 80 blank characters plus null termination
  2532. blank = _pad_bytes("", 81)
  2533. if self._variable_labels is None:
  2534. for i in range(self.nvar):
  2535. self._write(blank)
  2536. return
  2537. for col in self.data:
  2538. if col in self._variable_labels:
  2539. label = self._variable_labels[col]
  2540. if len(label) > 80:
  2541. raise ValueError("Variable labels must be 80 characters or fewer")
  2542. is_latin1 = all(ord(c) < 256 for c in label)
  2543. if not is_latin1:
  2544. raise ValueError(
  2545. "Variable labels must contain only characters that "
  2546. "can be encoded in Latin-1"
  2547. )
  2548. self._write(_pad_bytes(label, 81))
  2549. else:
  2550. self._write(blank)
    def _convert_strls(self, data: DataFrame) -> DataFrame:
        """
        Hook called by ``_prepare_data`` between date conversion and string
        padding; returns ``data`` unchanged in this class (kept for future
        compatibility).
        """
        return data
    def _prepare_data(self) -> np.rec.recarray:
        """
        Convert ``self.data`` into the record array that is written to disk.

        The frame stored on ``self.data`` is modified in place: date columns
        are converted, string columns are padded and cast to fixed-width
        bytes. Non-string columns keep their dtype, byte-swapped when the
        requested byte order differs from the machine's.

        Returns
        -------
        np.rec.recarray
            Result of ``DataFrame.to_records`` without the index, using the
            per-column dtypes assembled here.
        """
        data = self.data
        typlist = self.typlist
        convert_dates = self._convert_dates
        # 1. Convert dates
        if self._convert_dates is not None:
            for i, col in enumerate(data):
                # convert_dates is looked up by column position
                if i in convert_dates:
                    data[col] = _datetime_to_stata_elapsed_vec(
                        data[col], self.fmtlist[i]
                    )
        # 2. Convert strls
        data = self._convert_strls(data)

        # 3. Convert bad string data to '' and pad to correct length
        dtypes = {}
        native_byteorder = self._byteorder == _set_endianness(sys.byteorder)
        for i, col in enumerate(data):
            typ = typlist[i]
            # Type codes up to _max_string_length are string widths
            if typ <= self._max_string_length:
                with warnings.catch_warnings():
                    warnings.filterwarnings(
                        "ignore",
                        "Downcasting object dtype arrays",
                        category=FutureWarning,
                    )
                    dc = data[col].fillna("")
                data[col] = dc.apply(_pad_bytes, args=(typ,))
                stype = f"S{typ}"
                dtypes[col] = stype
                data[col] = data[col].astype(stype)
            else:
                dtype = data[col].dtype
                if not native_byteorder:
                    dtype = dtype.newbyteorder(self._byteorder)
                dtypes[col] = dtype

        return data.to_records(index=False, column_dtypes=dtypes)
  2590. def _write_data(self, records: np.rec.recarray) -> None:
  2591. self._write_bytes(records.tobytes())
  2592. @staticmethod
  2593. def _null_terminate_str(s: str) -> str:
  2594. s += "\x00"
  2595. return s
  2596. def _null_terminate_bytes(self, s: str) -> bytes:
  2597. return self._null_terminate_str(s).encode(self._encoding)
  2598. def _dtype_to_stata_type_117(dtype: np.dtype, column: Series, force_strl: bool) -> int:
  2599. """
  2600. Converts dtype types to stata types. Returns the byte of the given ordinal.
  2601. See TYPE_MAP and comments for an explanation. This is also explained in
  2602. the dta spec.
  2603. 1 - 2045 are strings of this length
  2604. Pandas Stata
  2605. 32768 - for object strL
  2606. 65526 - for int8 byte
  2607. 65527 - for int16 int
  2608. 65528 - for int32 long
  2609. 65529 - for float32 float
  2610. 65530 - for double double
  2611. If there are dates to convert, then dtype will already have the correct
  2612. type inserted.
  2613. """
  2614. # TODO: expand to handle datetime to integer conversion
  2615. if force_strl:
  2616. return 32768
  2617. if dtype.type is np.object_: # try to coerce it to the biggest string
  2618. # not memory efficient, what else could we
  2619. # do?
  2620. itemsize = max_len_string_array(ensure_object(column._values))
  2621. itemsize = max(itemsize, 1)
  2622. if itemsize <= 2045:
  2623. return itemsize
  2624. return 32768
  2625. elif dtype.type is np.float64:
  2626. return 65526
  2627. elif dtype.type is np.float32:
  2628. return 65527
  2629. elif dtype.type is np.int32:
  2630. return 65528
  2631. elif dtype.type is np.int16:
  2632. return 65529
  2633. elif dtype.type is np.int8:
  2634. return 65530
  2635. else: # pragma : no cover
  2636. raise NotImplementedError(f"Data type {dtype} not supported.")
  2637. def _pad_bytes_new(name: str | bytes, length: int) -> bytes:
  2638. """
  2639. Takes a bytes instance and pads it with null bytes until it's length chars.
  2640. """
  2641. if isinstance(name, str):
  2642. name = bytes(name, "utf-8")
  2643. return name + b"\x00" * (length - len(name))
  2644. class StataStrLWriter:
  2645. """
  2646. Converter for Stata StrLs
  2647. Stata StrLs map 8 byte values to strings which are stored using a
  2648. dictionary-like format where strings are keyed to two values.
  2649. Parameters
  2650. ----------
  2651. df : DataFrame
  2652. DataFrame to convert
  2653. columns : Sequence[str]
  2654. List of columns names to convert to StrL
  2655. version : int, optional
  2656. dta version. Currently supports 117, 118 and 119
  2657. byteorder : str, optional
  2658. Can be ">", "<", "little", or "big". default is `sys.byteorder`
  2659. Notes
  2660. -----
  2661. Supports creation of the StrL block of a dta file for dta versions
  2662. 117, 118 and 119. These differ in how the GSO is stored. 118 and
  2663. 119 store the GSO lookup value as a uint32 and a uint64, while 117
  2664. uses two uint32s. 118 and 119 also encode all strings as unicode
  2665. which is required by the format. 117 uses 'latin-1' a fixed width
  2666. encoding that extends the 7-bit ascii table with an additional 128
  2667. characters.
  2668. """
  2669. def __init__(
  2670. self,
  2671. df: DataFrame,
  2672. columns: Sequence[str],
  2673. version: int = 117,
  2674. byteorder: str | None = None,
  2675. ) -> None:
  2676. if version not in (117, 118, 119):
  2677. raise ValueError("Only dta versions 117, 118 and 119 supported")
  2678. self._dta_ver = version
  2679. self.df = df
  2680. self.columns = columns
  2681. self._gso_table = {"": (0, 0)}
  2682. if byteorder is None:
  2683. byteorder = sys.byteorder
  2684. self._byteorder = _set_endianness(byteorder)
  2685. gso_v_type = "I" # uint32
  2686. gso_o_type = "Q" # uint64
  2687. self._encoding = "utf-8"
  2688. if version == 117:
  2689. o_size = 4
  2690. gso_o_type = "I" # 117 used uint32
  2691. self._encoding = "latin-1"
  2692. elif version == 118:
  2693. o_size = 6
  2694. else: # version == 119
  2695. o_size = 5
  2696. self._o_offet = 2 ** (8 * (8 - o_size))
  2697. self._gso_o_type = gso_o_type
  2698. self._gso_v_type = gso_v_type
  2699. def _convert_key(self, key: tuple[int, int]) -> int:
  2700. v, o = key
  2701. return v + self._o_offet * o
  2702. def generate_table(self) -> tuple[dict[str, tuple[int, int]], DataFrame]:
  2703. """
  2704. Generates the GSO lookup table for the DataFrame
  2705. Returns
  2706. -------
  2707. gso_table : dict
  2708. Ordered dictionary using the string found as keys
  2709. and their lookup position (v,o) as values
  2710. gso_df : DataFrame
  2711. DataFrame where strl columns have been converted to
  2712. (v,o) values
  2713. Notes
  2714. -----
  2715. Modifies the DataFrame in-place.
  2716. The DataFrame returned encodes the (v,o) values as uint64s. The
  2717. encoding depends on the dta version, and can be expressed as
  2718. enc = v + o * 2 ** (o_size * 8)
  2719. so that v is stored in the lower bits and o is in the upper
  2720. bits. o_size is
  2721. * 117: 4
  2722. * 118: 6
  2723. * 119: 5
  2724. """
  2725. gso_table = self._gso_table
  2726. gso_df = self.df
  2727. columns = list(gso_df.columns)
  2728. selected = gso_df[self.columns]
  2729. col_index = [(col, columns.index(col)) for col in self.columns]
  2730. keys = np.empty(selected.shape, dtype=np.uint64)
  2731. for o, (idx, row) in enumerate(selected.iterrows()):
  2732. for j, (col, v) in enumerate(col_index):
  2733. val = row[col]
  2734. # Allow columns with mixed str and None (GH 23633)
  2735. val = "" if val is None else val
  2736. key = gso_table.get(val, None)
  2737. if key is None:
  2738. # Stata prefers human numbers
  2739. key = (v + 1, o + 1)
  2740. gso_table[val] = key
  2741. keys[o, j] = self._convert_key(key)
  2742. for i, col in enumerate(self.columns):
  2743. gso_df[col] = keys[:, i]
  2744. return gso_table, gso_df
  2745. def generate_blob(self, gso_table: dict[str, tuple[int, int]]) -> bytes:
  2746. """
  2747. Generates the binary blob of GSOs that is written to the dta file.
  2748. Parameters
  2749. ----------
  2750. gso_table : dict
  2751. Ordered dictionary (str, vo)
  2752. Returns
  2753. -------
  2754. gso : bytes
  2755. Binary content of dta file to be placed between strl tags
  2756. Notes
  2757. -----
  2758. Output format depends on dta version. 117 uses two uint32s to
  2759. express v and o while 118+ uses a uint32 for v and a uint64 for o.
  2760. """
  2761. # Format information
  2762. # Length includes null term
  2763. # 117
  2764. # GSOvvvvooootllllxxxxxxxxxxxxxxx...x
  2765. # 3 u4 u4 u1 u4 string + null term
  2766. #
  2767. # 118, 119
  2768. # GSOvvvvooooooootllllxxxxxxxxxxxxxxx...x
  2769. # 3 u4 u8 u1 u4 string + null term
  2770. bio = BytesIO()
  2771. gso = bytes("GSO", "ascii")
  2772. gso_type = struct.pack(self._byteorder + "B", 130)
  2773. null = struct.pack(self._byteorder + "B", 0)
  2774. v_type = self._byteorder + self._gso_v_type
  2775. o_type = self._byteorder + self._gso_o_type
  2776. len_type = self._byteorder + "I"
  2777. for strl, vo in gso_table.items():
  2778. if vo == (0, 0):
  2779. continue
  2780. v, o = vo
  2781. # GSO
  2782. bio.write(gso)
  2783. # vvvv
  2784. bio.write(struct.pack(v_type, v))
  2785. # oooo / oooooooo
  2786. bio.write(struct.pack(o_type, o))
  2787. # t
  2788. bio.write(gso_type)
  2789. # llll
  2790. utf8_string = bytes(strl, "utf-8")
  2791. bio.write(struct.pack(len_type, len(utf8_string) + 1))
  2792. # xxx...xxx
  2793. bio.write(utf8_string)
  2794. bio.write(null)
  2795. return bio.getvalue()
  2796. class StataWriter117(StataWriter):
  2797. """
  2798. A class for writing Stata binary dta files in Stata 13 format (117)
  2799. Parameters
  2800. ----------
  2801. fname : path (string), buffer or path object
  2802. string, path object (pathlib.Path or py._path.local.LocalPath) or
  2803. object implementing a binary write() functions. If using a buffer
  2804. then the buffer will not be automatically closed after the file
  2805. is written.
  2806. data : DataFrame
  2807. Input to save
  2808. convert_dates : dict
  2809. Dictionary mapping columns containing datetime types to stata internal
  2810. format to use when writing the dates. Options are 'tc', 'td', 'tm',
  2811. 'tw', 'th', 'tq', 'ty'. Column can be either an integer or a name.
  2812. Datetime columns that do not have a conversion type specified will be
  2813. converted to 'tc'. Raises NotImplementedError if a datetime column has
  2814. timezone information
  2815. write_index : bool
  2816. Write the index to Stata dataset.
  2817. byteorder : str
  2818. Can be ">", "<", "little", or "big". default is `sys.byteorder`
  2819. time_stamp : datetime
  2820. A datetime to use as file creation date. Default is the current time
  2821. data_label : str
  2822. A label for the data set. Must be 80 characters or smaller.
  2823. variable_labels : dict
  2824. Dictionary containing columns as keys and variable labels as values.
  2825. Each label must be 80 characters or smaller.
  2826. convert_strl : list
  2827. List of columns names to convert to Stata StrL format. Columns with
  2828. more than 2045 characters are automatically written as StrL.
  2829. Smaller columns can be converted by including the column name. Using
  2830. StrLs can reduce output file size when strings are longer than 8
  2831. characters, and either frequently repeated or sparse.
  2832. {compression_options}
  2833. .. versionchanged:: 1.4.0 Zstandard support.
  2834. value_labels : dict of dicts
  2835. Dictionary containing columns as keys and dictionaries of column value
  2836. to labels as values. The combined length of all labels for a single
  2837. variable must be 32,000 characters or smaller.
  2838. .. versionadded:: 1.4.0
  2839. Returns
  2840. -------
  2841. writer : StataWriter117 instance
  2842. The StataWriter117 instance has a write_file method, which will
  2843. write the file to the given `fname`.
  2844. Raises
  2845. ------
  2846. NotImplementedError
  2847. * If datetimes contain timezone information
  2848. ValueError
  2849. * Columns listed in convert_dates are neither datetime64[ns]
  2850. or datetime
  2851. * Column dtype is not representable in Stata
  2852. * Column listed in convert_dates is not in DataFrame
  2853. * Categorical label contains more than 32,000 characters
  2854. Examples
  2855. --------
  2856. >>> data = pd.DataFrame([[1.0, 1, 'a']], columns=['a', 'b', 'c'])
  2857. >>> writer = pd.io.stata.StataWriter117('./data_file.dta', data)
  2858. >>> writer.write_file()
  2859. Directly write a zip file
  2860. >>> compression = {"method": "zip", "archive_name": "data_file.dta"}
  2861. >>> writer = pd.io.stata.StataWriter117(
  2862. ... './data_file.zip', data, compression=compression
  2863. ... )
  2864. >>> writer.write_file()
  2865. Or with long strings stored in strl format
  2866. >>> data = pd.DataFrame([['A relatively long string'], [''], ['']],
  2867. ... columns=['strls'])
  2868. >>> writer = pd.io.stata.StataWriter117(
  2869. ... './data_file_with_long_strings.dta', data, convert_strl=['strls'])
  2870. >>> writer.write_file()
  2871. """
    # Overrides the base-class limit: strings up to this encoded length stay
    # fixed-width columns.
    _max_string_length = 2045
    # Release number written into the header and passed to the strL converter
    # as well as used to pick field widths.
    _dta_version = 117
  def __init__(
      self,
      fname: FilePath | WriteBuffer[bytes],
      data: DataFrame,
      convert_dates: dict[Hashable, str] | None = None,
      write_index: bool = True,
      byteorder: str | None = None,
      time_stamp: datetime | None = None,
      data_label: str | None = None,
      variable_labels: dict[Hashable, str] | None = None,
      convert_strl: Sequence[Hashable] | None = None,
      compression: CompressionOptions = "infer",
      storage_options: StorageOptions | None = None,
      *,
      value_labels: dict[Hashable, dict[float, str]] | None = None,
  ) -> None:
      """
      Initialise strL bookkeeping and delegate the rest to the parent writer.

      ``convert_strl`` is handled here; every other argument is forwarded
      unchanged to the parent initializer.
      """
      # Work on a private list: the stored names may be rewritten later
      # (see ``_update_strl_names``) and the caller's sequence must not change.
      self._convert_strl: list[Hashable] = (
          [] if convert_strl is None else list(convert_strl)
      )

      super().__init__(
          fname,
          data,
          convert_dates,
          write_index,
          byteorder=byteorder,
          time_stamp=time_stamp,
          data_label=data_label,
          variable_labels=variable_labels,
          value_labels=value_labels,
          compression=compression,
          storage_options=storage_options,
      )

      # Section name -> file offset; populated while the file is written.
      self._map: dict[str, int] = {}
      # Serialised strL payload; stays empty unless strL columns are converted.
      self._strl_blob = b""
  2909. @staticmethod
  2910. def _tag(val: str | bytes, tag: str) -> bytes:
  2911. """Surround val with <tag></tag>"""
  2912. if isinstance(val, str):
  2913. val = bytes(val, "utf-8")
  2914. return bytes("<" + tag + ">", "utf-8") + val + bytes("</" + tag + ">", "utf-8")
  2915. def _update_map(self, tag: str) -> None:
  2916. """Update map location for tag with file position"""
  2917. assert self.handles.handle is not None
  2918. self._map[tag] = self.handles.handle.tell()
  2919. def _write_header(
  2920. self,
  2921. data_label: str | None = None,
  2922. time_stamp: datetime | None = None,
  2923. ) -> None:
  2924. """Write the file header"""
  2925. byteorder = self._byteorder
  2926. self._write_bytes(bytes("<stata_dta>", "utf-8"))
  2927. bio = BytesIO()
  2928. # ds_format - 117
  2929. bio.write(self._tag(bytes(str(self._dta_version), "utf-8"), "release"))
  2930. # byteorder
  2931. bio.write(self._tag(byteorder == ">" and "MSF" or "LSF", "byteorder"))
  2932. # number of vars, 2 bytes in 117 and 118, 4 byte in 119
  2933. nvar_type = "H" if self._dta_version <= 118 else "I"
  2934. bio.write(self._tag(struct.pack(byteorder + nvar_type, self.nvar), "K"))
  2935. # 117 uses 4 bytes, 118 uses 8
  2936. nobs_size = "I" if self._dta_version == 117 else "Q"
  2937. bio.write(self._tag(struct.pack(byteorder + nobs_size, self.nobs), "N"))
  2938. # data label 81 bytes, char, null terminated
  2939. label = data_label[:80] if data_label is not None else ""
  2940. encoded_label = label.encode(self._encoding)
  2941. label_size = "B" if self._dta_version == 117 else "H"
  2942. label_len = struct.pack(byteorder + label_size, len(encoded_label))
  2943. encoded_label = label_len + encoded_label
  2944. bio.write(self._tag(encoded_label, "label"))
  2945. # time stamp, 18 bytes, char, null terminated
  2946. # format dd Mon yyyy hh:mm
  2947. if time_stamp is None:
  2948. time_stamp = datetime.now()
  2949. elif not isinstance(time_stamp, datetime):
  2950. raise ValueError("time_stamp should be datetime type")
  2951. # Avoid locale-specific month conversion
  2952. months = [
  2953. "Jan",
  2954. "Feb",
  2955. "Mar",
  2956. "Apr",
  2957. "May",
  2958. "Jun",
  2959. "Jul",
  2960. "Aug",
  2961. "Sep",
  2962. "Oct",
  2963. "Nov",
  2964. "Dec",
  2965. ]
  2966. month_lookup = {i + 1: month for i, month in enumerate(months)}
  2967. ts = (
  2968. time_stamp.strftime("%d ")
  2969. + month_lookup[time_stamp.month]
  2970. + time_stamp.strftime(" %Y %H:%M")
  2971. )
  2972. # '\x11' added due to inspection of Stata file
  2973. stata_ts = b"\x11" + bytes(ts, "utf-8")
  2974. bio.write(self._tag(stata_ts, "timestamp"))
  2975. self._write_bytes(self._tag(bio.getvalue(), "header"))
  2976. def _write_map(self) -> None:
  2977. """
  2978. Called twice during file write. The first populates the values in
  2979. the map with 0s. The second call writes the final map locations when
  2980. all blocks have been written.
  2981. """
  2982. if not self._map:
  2983. self._map = {
  2984. "stata_data": 0,
  2985. "map": self.handles.handle.tell(),
  2986. "variable_types": 0,
  2987. "varnames": 0,
  2988. "sortlist": 0,
  2989. "formats": 0,
  2990. "value_label_names": 0,
  2991. "variable_labels": 0,
  2992. "characteristics": 0,
  2993. "data": 0,
  2994. "strls": 0,
  2995. "value_labels": 0,
  2996. "stata_data_close": 0,
  2997. "end-of-file": 0,
  2998. }
  2999. # Move to start of map
  3000. self.handles.handle.seek(self._map["map"])
  3001. bio = BytesIO()
  3002. for val in self._map.values():
  3003. bio.write(struct.pack(self._byteorder + "Q", val))
  3004. self._write_bytes(self._tag(bio.getvalue(), "map"))
  3005. def _write_variable_types(self) -> None:
  3006. self._update_map("variable_types")
  3007. bio = BytesIO()
  3008. for typ in self.typlist:
  3009. bio.write(struct.pack(self._byteorder + "H", typ))
  3010. self._write_bytes(self._tag(bio.getvalue(), "variable_types"))
  def _write_varnames(self) -> None:
      """Write ``<varnames>``: each name cut to 32 characters and padded to a fixed width."""
      self._update_map("varnames")
      # 118 scales by 4 to accommodate utf-8 data worst case encoding
      field_width = (32 if self._dta_version == 117 else 128) + 1
      fields = []
      for raw_name in self.varlist:
          terminated = self._null_terminate_str(raw_name)
          encoded = terminated[:32].encode(self._encoding)
          fields.append(_pad_bytes_new(encoded, field_width))
      self._write_bytes(self._tag(b"".join(fields), "varnames"))
  3021. def _write_sortlist(self) -> None:
  3022. self._update_map("sortlist")
  3023. sort_size = 2 if self._dta_version < 119 else 4
  3024. self._write_bytes(self._tag(b"\x00" * sort_size * (self.nvar + 1), "sortlist"))
  def _write_formats(self) -> None:
      """Write ``<formats>``: every entry of ``fmtlist`` encoded and padded to a fixed width."""
      self._update_map("formats")
      # Field width is 49 bytes for version 117 and 57 bytes otherwise.
      field_width = 49 if self._dta_version == 117 else 57
      fields = [
          _pad_bytes_new(fmt.encode(self._encoding), field_width)
          for fmt in self.fmtlist
      ]
      self._write_bytes(self._tag(b"".join(fields), "formats"))
  def _write_value_label_names(self) -> None:
      """
      Write ``<value_label_names>``: one fixed-width field per variable.

      A variable flagged in ``_has_value_labels`` uses its own name as the
      label name; every other variable gets an empty name.
      """
      self._update_map("value_label_names")
      # 118 scales by 4 to accommodate utf-8 data worst case encoding
      field_width = (32 if self._dta_version == 117 else 128) + 1
      out = BytesIO()
      for idx in range(self.nvar):
          # Use variable name when categorical
          label_name = self.varlist[idx] if self._has_value_labels[idx] else ""
          label_name = self._null_terminate_str(label_name)
          out.write(
              _pad_bytes_new(label_name[:32].encode(self._encoding), field_width)
          )
      self._write_bytes(self._tag(out.getvalue(), "value_label_names"))
  def _write_variable_labels(self) -> None:
      """
      Write ``<variable_labels>``: one fixed-width field per variable.

      Raises
      ------
      ValueError
          If a label is longer than 80 characters or cannot be encoded with
          the writer's encoding.
      """
      self._update_map("variable_labels")
      # 118 scales by 4 to accommodate utf-8 data worst case encoding
      field_width = (80 if self._dta_version == 117 else 320) + 1
      # Missing labels are written as an empty, fully padded field
      blank = _pad_bytes_new("", field_width)

      labels = self._variable_labels
      fields = []
      if labels is None:
          fields.extend(blank for _ in range(self.nvar))
      else:
          for col in self.data:
              if col not in labels:
                  fields.append(blank)
                  continue
              label = labels[col]
              if len(label) > 80:
                  raise ValueError("Variable labels must be 80 characters or fewer")
              try:
                  encoded = label.encode(self._encoding)
              except UnicodeEncodeError as err:
                  raise ValueError(
                      "Variable labels must contain only characters that "
                      f"can be encoded in {self._encoding}"
                  ) from err
              fields.append(_pad_bytes_new(encoded, field_width))

      self._write_bytes(self._tag(b"".join(fields), "variable_labels"))
  3074. def _write_characteristics(self) -> None:
  3075. self._update_map("characteristics")
  3076. self._write_bytes(self._tag(b"", "characteristics"))
  3077. def _write_data(self, records) -> None:
  3078. self._update_map("data")
  3079. self._write_bytes(b"<data>")
  3080. self._write_bytes(records.tobytes())
  3081. self._write_bytes(b"</data>")
  3082. def _write_strls(self) -> None:
  3083. self._update_map("strls")
  3084. self._write_bytes(self._tag(self._strl_blob, "strls"))
  3085. def _write_expansion_fields(self) -> None:
  3086. """No-op in dta 117+"""
  3087. def _write_value_labels(self) -> None:
  3088. self._update_map("value_labels")
  3089. bio = BytesIO()
  3090. for vl in self._value_labels:
  3091. lab = vl.generate_value_label(self._byteorder)
  3092. lab = self._tag(lab, "lbl")
  3093. bio.write(lab)
  3094. self._write_bytes(self._tag(bio.getvalue(), "value_labels"))
  3095. def _write_file_close_tag(self) -> None:
  3096. self._update_map("stata_data_close")
  3097. self._write_bytes(bytes("</stata_dta>", "utf-8"))
  3098. self._update_map("end-of-file")
  3099. def _update_strl_names(self) -> None:
  3100. """
  3101. Update column names for conversion to strl if they might have been
  3102. changed to comply with Stata naming rules
  3103. """
  3104. # Update convert_strl if names changed
  3105. for orig, new in self._converted_names.items():
  3106. if orig in self._convert_strl:
  3107. idx = self._convert_strl.index(orig)
  3108. self._convert_strl[idx] = new
  def _convert_strls(self, data: DataFrame) -> DataFrame:
      """
      Convert the selected columns to strL and keep the resulting blob.

      A column is selected when its entry in ``typlist`` equals 32768 or when
      it is listed in ``_convert_strl``. With no column selected, ``data`` is
      returned as is and the stored blob is left untouched.
      """
      selected = []
      for position, column in enumerate(data):
          if self.typlist[position] == 32768 or column in self._convert_strl:
              selected.append(column)

      if not selected:
          return data

      strl_writer = StataStrLWriter(data, selected, version=self._dta_version)
      table, converted = strl_writer.generate_table()
      self._strl_blob = strl_writer.generate_blob(table)
      return converted
  def _set_formats_and_types(self, dtypes: Series) -> None:
      """
      Rebuild ``typlist`` and ``fmtlist`` with one entry per item of ``dtypes``.

      Columns named in ``_convert_strl`` are passed to both helpers with
      ``force_strl`` set.
      """
      # Bind the fresh lists to the instance first so they fill incrementally.
      type_codes = self.typlist = []
      formats = self.fmtlist = []
      for column, dtype in dtypes.items():
          as_strl = column in self._convert_strl
          formats.append(
              _dtype_to_default_stata_fmt(
                  dtype,
                  self.data[column],
                  dta_version=self._dta_version,
                  force_strl=as_strl,
              )
          )
          type_codes.append(
              _dtype_to_stata_type_117(dtype, self.data[column], as_strl)
          )
  class StataWriterUTF8(StataWriter117):
      """
      Stata binary dta file writing in Stata 15 (118) and 16 (119) formats

      DTA 118 and 119 format files support unicode string data (both fixed
      and strL) format. Unicode is also supported in value labels, variable
      labels and the dataset label. Format 119 is automatically used if the
      file contains more than 32,767 variables.

      Parameters
      ----------
      fname : path (string), buffer or path object
          string, path object (pathlib.Path or py._path.local.LocalPath) or
          object implementing a binary write() function. If using a buffer
          then the buffer will not be automatically closed after the file
          is written.
      data : DataFrame
          Input to save
      convert_dates : dict, default None
          Dictionary mapping columns containing datetime types to stata internal
          format to use when writing the dates. Options are 'tc', 'td', 'tm',
          'tw', 'th', 'tq', 'ty'. Column can be either an integer or a name.
          Datetime columns that do not have a conversion type specified will be
          converted to 'tc'. Raises NotImplementedError if a datetime column has
          timezone information
      write_index : bool, default True
          Write the index to Stata dataset.
      byteorder : str, default None
          Can be ">", "<", "little", or "big". default is `sys.byteorder`
      time_stamp : datetime, default None
          A datetime to use as file creation date. Default is the current time
      data_label : str, default None
          A label for the data set. Must be 80 characters or smaller.
      variable_labels : dict, default None
          Dictionary containing columns as keys and variable labels as values.
          Each label must be 80 characters or smaller.
      convert_strl : list, default None
          List of columns names to convert to Stata StrL format. Columns with
          more than 2045 characters are automatically written as StrL.
          Smaller columns can be converted by including the column name. Using
          StrLs can reduce output file size when strings are longer than 8
          characters, and either frequently repeated or sparse.
      version : int, default None
          The dta version to use. By default, uses the size of data to determine
          the version. 118 is used if data.shape[1] <= 32767, and 119 is used
          for storing larger DataFrames.
      compression : str or dict, default 'infer'
          Compression applied to the output file. A dict selects the
          compression with its ``"method"`` key and may carry method-specific
          options such as ``"archive_name"`` for zip (see Examples).

          .. versionchanged:: 1.4.0 Zstandard support.

      storage_options : dict, optional
          Passed through unchanged to the parent writer.
      value_labels : dict of dicts
          Dictionary containing columns as keys and dictionaries of column value
          to labels as values. The combined length of all labels for a single
          variable must be 32,000 characters or smaller.

          .. versionadded:: 1.4.0

      Returns
      -------
      StataWriterUTF8
          The instance has a write_file method, which will write the file to the
          given `fname`.

      Raises
      ------
      NotImplementedError
          * If datetimes contain timezone information
      ValueError
          * Columns listed in convert_dates are neither datetime64[ns]
            or datetime
          * Column dtype is not representable in Stata
          * Column listed in convert_dates is not in DataFrame
          * Categorical label contains more than 32,000 characters

      Examples
      --------
      Using Unicode data and column names

      >>> from pandas.io.stata import StataWriterUTF8
      >>> data = pd.DataFrame([[1.0, 1, 'ᴬ']], columns=['a', 'β', 'ĉ'])
      >>> writer = StataWriterUTF8('./data_file.dta', data)
      >>> writer.write_file()

      Directly write a zip file

      >>> compression = {"method": "zip", "archive_name": "data_file.dta"}
      >>> writer = StataWriterUTF8('./data_file.zip', data, compression=compression)
      >>> writer.write_file()

      Or with long strings stored in strl format

      >>> data = pd.DataFrame([['ᴀ relatively long ŝtring'], [''], ['']],
      ...                     columns=['strls'])
      >>> writer = StataWriterUTF8('./data_file_with_long_strings.dta', data,
      ...                          convert_strl=['strls'])
      >>> writer.write_file()
      """

      _encoding: Literal["utf-8"] = "utf-8"

      def __init__(
          self,
          fname: FilePath | WriteBuffer[bytes],
          data: DataFrame,
          convert_dates: dict[Hashable, str] | None = None,
          write_index: bool = True,
          byteorder: str | None = None,
          time_stamp: datetime | None = None,
          data_label: str | None = None,
          variable_labels: dict[Hashable, str] | None = None,
          convert_strl: Sequence[Hashable] | None = None,
          version: int | None = None,
          compression: CompressionOptions = "infer",
          storage_options: StorageOptions | None = None,
          *,
          value_labels: dict[Hashable, dict[float, str]] | None = None,
      ) -> None:
          """
          Resolve the dta version, then initialise the parent writer.

          Raises
          ------
          ValueError
              If ``version`` is neither 118 nor 119, or if version 118 is
              requested for data with more than 32,767 columns.
          """
          if version is None:
              version = 118 if data.shape[1] <= 32767 else 119
          elif version not in (118, 119):
              raise ValueError("version must be either 118 or 119.")
          elif version == 118 and data.shape[1] > 32767:
              # The trailing space keeps the two literals from fusing into
              # "more than32,767".
              raise ValueError(
                  "You must use version 119 for data sets containing more than "
                  "32,767 variables"
              )

          super().__init__(
              fname,
              data,
              convert_dates=convert_dates,
              write_index=write_index,
              byteorder=byteorder,
              time_stamp=time_stamp,
              data_label=data_label,
              variable_labels=variable_labels,
              value_labels=value_labels,
              convert_strl=convert_strl,
              compression=compression,
              storage_options=storage_options,
          )
          # Override version set in StataWriter117 init
          self._dta_version = version

      def _validate_variable_name(self, name: str) -> str:
          """
          Validate variable names for Stata export.

          Parameters
          ----------
          name : str
              Variable name

          Returns
          -------
          str
              The validated name with invalid characters replaced with
              underscores.

          Notes
          -----
          Stata 118+ support most unicode characters. The only limitation is in
          the ascii range where the characters supported are a-z, A-Z, 0-9 and _.
          """
          cleaned = []
          for c in name:
              if ord(c) < 128:
                  # ASCII range: only letters, digits and underscore survive
                  valid = (
                      "A" <= c <= "Z"
                      or "a" <= c <= "z"
                      or "0" <= c <= "9"
                      or c == "_"
                  )
              else:
                  # High code points appear to be acceptable, except for
                  # 128-191 and the multiplication and division signs
                  valid = ord(c) >= 192 and c not in {"×", "÷"}  # noqa: RUF001
              cleaned.append(c if valid else "_")
          return "".join(cleaned)