# test_stata.py (91 KB)
  1. import bz2
  2. import datetime as dt
  3. from datetime import datetime
  4. import gzip
  5. import io
  6. import os
  7. import struct
  8. import tarfile
  9. import zipfile
  10. import numpy as np
  11. import pytest
  12. import pandas.util._test_decorators as td
  13. import pandas as pd
  14. from pandas import CategoricalDtype
  15. import pandas._testing as tm
  16. from pandas.core.frame import (
  17. DataFrame,
  18. Series,
  19. )
  20. from pandas.io.parsers import read_csv
  21. from pandas.io.stata import (
  22. CategoricalConversionWarning,
  23. InvalidColumnName,
  24. PossiblePrecisionLoss,
  25. StataMissingValue,
  26. StataReader,
  27. StataWriter,
  28. StataWriterUTF8,
  29. ValueLabelTypeMismatch,
  30. read_stata,
  31. )
  32. @pytest.fixture
  33. def mixed_frame():
  34. return DataFrame(
  35. {
  36. "a": [1, 2, 3, 4],
  37. "b": [1.0, 3.0, 27.0, 81.0],
  38. "c": ["Atlanta", "Birmingham", "Cincinnati", "Detroit"],
  39. }
  40. )
  41. @pytest.fixture
  42. def parsed_114(datapath):
  43. dta14_114 = datapath("io", "data", "stata", "stata5_114.dta")
  44. parsed_114 = read_stata(dta14_114, convert_dates=True)
  45. parsed_114.index.name = "index"
  46. return parsed_114
  47. class TestStata:
  48. def read_dta(self, file):
  49. # Legacy default reader configuration
  50. return read_stata(file, convert_dates=True)
  51. def read_csv(self, file):
  52. return read_csv(file, parse_dates=True)
  53. @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
  54. def test_read_empty_dta(self, version):
  55. empty_ds = DataFrame(columns=["unit"])
  56. # GH 7369, make sure can read a 0-obs dta file
  57. with tm.ensure_clean() as path:
  58. empty_ds.to_stata(path, write_index=False, version=version)
  59. empty_ds2 = read_stata(path)
  60. tm.assert_frame_equal(empty_ds, empty_ds2)
  61. @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
  62. def test_read_empty_dta_with_dtypes(self, version):
  63. # GH 46240
  64. # Fixing above bug revealed that types are not correctly preserved when
  65. # writing empty DataFrames
  66. empty_df_typed = DataFrame(
  67. {
  68. "i8": np.array([0], dtype=np.int8),
  69. "i16": np.array([0], dtype=np.int16),
  70. "i32": np.array([0], dtype=np.int32),
  71. "i64": np.array([0], dtype=np.int64),
  72. "u8": np.array([0], dtype=np.uint8),
  73. "u16": np.array([0], dtype=np.uint16),
  74. "u32": np.array([0], dtype=np.uint32),
  75. "u64": np.array([0], dtype=np.uint64),
  76. "f32": np.array([0], dtype=np.float32),
  77. "f64": np.array([0], dtype=np.float64),
  78. }
  79. )
  80. expected = empty_df_typed.copy()
  81. # No uint# support. Downcast since values in range for int#
  82. expected["u8"] = expected["u8"].astype(np.int8)
  83. expected["u16"] = expected["u16"].astype(np.int16)
  84. expected["u32"] = expected["u32"].astype(np.int32)
  85. # No int64 supported at all. Downcast since values in range for int32
  86. expected["u64"] = expected["u64"].astype(np.int32)
  87. expected["i64"] = expected["i64"].astype(np.int32)
  88. # GH 7369, make sure can read a 0-obs dta file
  89. with tm.ensure_clean() as path:
  90. empty_df_typed.to_stata(path, write_index=False, version=version)
  91. empty_reread = read_stata(path)
  92. tm.assert_frame_equal(expected, empty_reread)
  93. tm.assert_series_equal(expected.dtypes, empty_reread.dtypes)
  94. @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
  95. def test_read_index_col_none(self, version):
  96. df = DataFrame({"a": range(5), "b": ["b1", "b2", "b3", "b4", "b5"]})
  97. # GH 7369, make sure can read a 0-obs dta file
  98. with tm.ensure_clean() as path:
  99. df.to_stata(path, write_index=False, version=version)
  100. read_df = read_stata(path)
  101. assert isinstance(read_df.index, pd.RangeIndex)
  102. expected = df.copy()
  103. expected["a"] = expected["a"].astype(np.int32)
  104. tm.assert_frame_equal(read_df, expected, check_index_type=True)
  105. @pytest.mark.parametrize("file", ["stata1_114", "stata1_117"])
  106. def test_read_dta1(self, file, datapath):
  107. file = datapath("io", "data", "stata", f"{file}.dta")
  108. parsed = self.read_dta(file)
  109. # Pandas uses np.nan as missing value.
  110. # Thus, all columns will be of type float, regardless of their name.
  111. expected = DataFrame(
  112. [(np.nan, np.nan, np.nan, np.nan, np.nan)],
  113. columns=["float_miss", "double_miss", "byte_miss", "int_miss", "long_miss"],
  114. )
  115. # this is an oddity as really the nan should be float64, but
  116. # the casting doesn't fail so need to match stata here
  117. expected["float_miss"] = expected["float_miss"].astype(np.float32)
  118. tm.assert_frame_equal(parsed, expected)
  119. def test_read_dta2(self, datapath):
  120. expected = DataFrame.from_records(
  121. [
  122. (
  123. datetime(2006, 11, 19, 23, 13, 20),
  124. 1479596223000,
  125. datetime(2010, 1, 20),
  126. datetime(2010, 1, 8),
  127. datetime(2010, 1, 1),
  128. datetime(1974, 7, 1),
  129. datetime(2010, 1, 1),
  130. datetime(2010, 1, 1),
  131. ),
  132. (
  133. datetime(1959, 12, 31, 20, 3, 20),
  134. -1479590,
  135. datetime(1953, 10, 2),
  136. datetime(1948, 6, 10),
  137. datetime(1955, 1, 1),
  138. datetime(1955, 7, 1),
  139. datetime(1955, 1, 1),
  140. datetime(2, 1, 1),
  141. ),
  142. (pd.NaT, pd.NaT, pd.NaT, pd.NaT, pd.NaT, pd.NaT, pd.NaT, pd.NaT),
  143. ],
  144. columns=[
  145. "datetime_c",
  146. "datetime_big_c",
  147. "date",
  148. "weekly_date",
  149. "monthly_date",
  150. "quarterly_date",
  151. "half_yearly_date",
  152. "yearly_date",
  153. ],
  154. )
  155. expected["yearly_date"] = expected["yearly_date"].astype("O")
  156. path1 = datapath("io", "data", "stata", "stata2_114.dta")
  157. path2 = datapath("io", "data", "stata", "stata2_115.dta")
  158. path3 = datapath("io", "data", "stata", "stata2_117.dta")
  159. with tm.assert_produces_warning(UserWarning):
  160. parsed_114 = self.read_dta(path1)
  161. with tm.assert_produces_warning(UserWarning):
  162. parsed_115 = self.read_dta(path2)
  163. with tm.assert_produces_warning(UserWarning):
  164. parsed_117 = self.read_dta(path3)
  165. # FIXME: don't leave commented-out
  166. # 113 is buggy due to limits of date format support in Stata
  167. # parsed_113 = self.read_dta(
  168. # datapath("io", "data", "stata", "stata2_113.dta")
  169. # )
  170. # FIXME: don't leave commented-out
  171. # buggy test because of the NaT comparison on certain platforms
  172. # Format 113 test fails since it does not support tc and tC formats
  173. # tm.assert_frame_equal(parsed_113, expected)
  174. tm.assert_frame_equal(parsed_114, expected, check_datetimelike_compat=True)
  175. tm.assert_frame_equal(parsed_115, expected, check_datetimelike_compat=True)
  176. tm.assert_frame_equal(parsed_117, expected, check_datetimelike_compat=True)
  177. @pytest.mark.parametrize(
  178. "file", ["stata3_113", "stata3_114", "stata3_115", "stata3_117"]
  179. )
  180. def test_read_dta3(self, file, datapath):
  181. file = datapath("io", "data", "stata", f"{file}.dta")
  182. parsed = self.read_dta(file)
  183. # match stata here
  184. expected = self.read_csv(datapath("io", "data", "stata", "stata3.csv"))
  185. expected = expected.astype(np.float32)
  186. expected["year"] = expected["year"].astype(np.int16)
  187. expected["quarter"] = expected["quarter"].astype(np.int8)
  188. tm.assert_frame_equal(parsed, expected)
  189. @pytest.mark.parametrize(
  190. "file", ["stata4_113", "stata4_114", "stata4_115", "stata4_117"]
  191. )
  192. def test_read_dta4(self, file, datapath):
  193. file = datapath("io", "data", "stata", f"{file}.dta")
  194. parsed = self.read_dta(file)
  195. expected = DataFrame.from_records(
  196. [
  197. ["one", "ten", "one", "one", "one"],
  198. ["two", "nine", "two", "two", "two"],
  199. ["three", "eight", "three", "three", "three"],
  200. ["four", "seven", 4, "four", "four"],
  201. ["five", "six", 5, np.nan, "five"],
  202. ["six", "five", 6, np.nan, "six"],
  203. ["seven", "four", 7, np.nan, "seven"],
  204. ["eight", "three", 8, np.nan, "eight"],
  205. ["nine", "two", 9, np.nan, "nine"],
  206. ["ten", "one", "ten", np.nan, "ten"],
  207. ],
  208. columns=[
  209. "fully_labeled",
  210. "fully_labeled2",
  211. "incompletely_labeled",
  212. "labeled_with_missings",
  213. "float_labelled",
  214. ],
  215. )
  216. # these are all categoricals
  217. for col in expected:
  218. orig = expected[col].copy()
  219. categories = np.asarray(expected["fully_labeled"][orig.notna()])
  220. if col == "incompletely_labeled":
  221. categories = orig
  222. cat = orig.astype("category")._values
  223. cat = cat.set_categories(categories, ordered=True)
  224. cat.categories.rename(None, inplace=True)
  225. expected[col] = cat
  226. # stata doesn't save .category metadata
  227. tm.assert_frame_equal(parsed, expected)
  228. # File containing strls
  229. def test_read_dta12(self, datapath):
  230. parsed_117 = self.read_dta(datapath("io", "data", "stata", "stata12_117.dta"))
  231. expected = DataFrame.from_records(
  232. [
  233. [1, "abc", "abcdefghi"],
  234. [3, "cba", "qwertywertyqwerty"],
  235. [93, "", "strl"],
  236. ],
  237. columns=["x", "y", "z"],
  238. )
  239. tm.assert_frame_equal(parsed_117, expected, check_dtype=False)
  240. def test_read_dta18(self, datapath):
  241. parsed_118 = self.read_dta(datapath("io", "data", "stata", "stata14_118.dta"))
  242. parsed_118["Bytes"] = parsed_118["Bytes"].astype("O")
  243. expected = DataFrame.from_records(
  244. [
  245. ["Cat", "Bogota", "Bogotá", 1, 1.0, "option b Ünicode", 1.0],
  246. ["Dog", "Boston", "Uzunköprü", np.nan, np.nan, np.nan, np.nan],
  247. ["Plane", "Rome", "Tromsø", 0, 0.0, "option a", 0.0],
  248. ["Potato", "Tokyo", "Elâzığ", -4, 4.0, 4, 4], # noqa: RUF001
  249. ["", "", "", 0, 0.3332999, "option a", 1 / 3.0],
  250. ],
  251. columns=[
  252. "Things",
  253. "Cities",
  254. "Unicode_Cities_Strl",
  255. "Ints",
  256. "Floats",
  257. "Bytes",
  258. "Longs",
  259. ],
  260. )
  261. expected["Floats"] = expected["Floats"].astype(np.float32)
  262. for col in parsed_118.columns:
  263. tm.assert_almost_equal(parsed_118[col], expected[col])
  264. with StataReader(datapath("io", "data", "stata", "stata14_118.dta")) as rdr:
  265. vl = rdr.variable_labels()
  266. vl_expected = {
  267. "Unicode_Cities_Strl": "Here are some strls with Ünicode chars",
  268. "Longs": "long data",
  269. "Things": "Here are some things",
  270. "Bytes": "byte data",
  271. "Ints": "int data",
  272. "Cities": "Here are some cities",
  273. "Floats": "float data",
  274. }
  275. tm.assert_dict_equal(vl, vl_expected)
  276. assert rdr.data_label == "This is a Ünicode data label"
  277. def test_read_write_dta5(self):
  278. original = DataFrame(
  279. [(np.nan, np.nan, np.nan, np.nan, np.nan)],
  280. columns=["float_miss", "double_miss", "byte_miss", "int_miss", "long_miss"],
  281. )
  282. original.index.name = "index"
  283. with tm.ensure_clean() as path:
  284. original.to_stata(path, convert_dates=None)
  285. written_and_read_again = self.read_dta(path)
  286. expected = original.copy()
  287. expected.index = expected.index.astype(np.int32)
  288. tm.assert_frame_equal(written_and_read_again.set_index("index"), expected)
  289. def test_write_dta6(self, datapath):
  290. original = self.read_csv(datapath("io", "data", "stata", "stata3.csv"))
  291. original.index.name = "index"
  292. original.index = original.index.astype(np.int32)
  293. original["year"] = original["year"].astype(np.int32)
  294. original["quarter"] = original["quarter"].astype(np.int32)
  295. with tm.ensure_clean() as path:
  296. original.to_stata(path, convert_dates=None)
  297. written_and_read_again = self.read_dta(path)
  298. tm.assert_frame_equal(
  299. written_and_read_again.set_index("index"),
  300. original,
  301. check_index_type=False,
  302. )
  303. @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
  304. def test_read_write_dta10(self, version, using_infer_string):
  305. original = DataFrame(
  306. data=[["string", "object", 1, 1.1, np.datetime64("2003-12-25")]],
  307. columns=["string", "object", "integer", "floating", "datetime"],
  308. )
  309. original["object"] = Series(original["object"], dtype=object)
  310. original.index.name = "index"
  311. original.index = original.index.astype(np.int32)
  312. original["integer"] = original["integer"].astype(np.int32)
  313. with tm.ensure_clean() as path:
  314. original.to_stata(path, convert_dates={"datetime": "tc"}, version=version)
  315. written_and_read_again = self.read_dta(path)
  316. expected = original.copy()
  317. if using_infer_string:
  318. expected["object"] = expected["object"].astype("str")
  319. # original.index is np.int32, read index is np.int64
  320. tm.assert_frame_equal(
  321. written_and_read_again.set_index("index"),
  322. expected,
  323. check_index_type=False,
  324. )
  325. def test_stata_doc_examples(self):
  326. with tm.ensure_clean() as path:
  327. df = DataFrame(
  328. np.random.default_rng(2).standard_normal((10, 2)), columns=list("AB")
  329. )
  330. df.to_stata(path)
  331. def test_write_preserves_original(self):
  332. # 9795
  333. df = DataFrame(
  334. np.random.default_rng(2).standard_normal((5, 4)), columns=list("abcd")
  335. )
  336. df.loc[2, "a":"c"] = np.nan
  337. df_copy = df.copy()
  338. with tm.ensure_clean() as path:
  339. df.to_stata(path, write_index=False)
  340. tm.assert_frame_equal(df, df_copy)
  341. @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
  342. def test_encoding(self, version, datapath):
  343. # GH 4626, proper encoding handling
  344. raw = read_stata(datapath("io", "data", "stata", "stata1_encoding.dta"))
  345. encoded = read_stata(datapath("io", "data", "stata", "stata1_encoding.dta"))
  346. result = encoded.kreis1849[0]
  347. expected = raw.kreis1849[0]
  348. assert result == expected
  349. assert isinstance(result, str)
  350. with tm.ensure_clean() as path:
  351. encoded.to_stata(path, write_index=False, version=version)
  352. reread_encoded = read_stata(path)
  353. tm.assert_frame_equal(encoded, reread_encoded)
  354. def test_read_write_dta11(self):
  355. original = DataFrame(
  356. [(1, 2, 3, 4)],
  357. columns=[
  358. "good",
  359. "b\u00E4d",
  360. "8number",
  361. "astringwithmorethan32characters______",
  362. ],
  363. )
  364. formatted = DataFrame(
  365. [(1, 2, 3, 4)],
  366. columns=["good", "b_d", "_8number", "astringwithmorethan32characters_"],
  367. )
  368. formatted.index.name = "index"
  369. formatted = formatted.astype(np.int32)
  370. with tm.ensure_clean() as path:
  371. with tm.assert_produces_warning(InvalidColumnName):
  372. original.to_stata(path, convert_dates=None)
  373. written_and_read_again = self.read_dta(path)
  374. expected = formatted.copy()
  375. expected.index = expected.index.astype(np.int32)
  376. tm.assert_frame_equal(written_and_read_again.set_index("index"), expected)
  377. @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
  378. def test_read_write_dta12(self, version):
  379. original = DataFrame(
  380. [(1, 2, 3, 4, 5, 6)],
  381. columns=[
  382. "astringwithmorethan32characters_1",
  383. "astringwithmorethan32characters_2",
  384. "+",
  385. "-",
  386. "short",
  387. "delete",
  388. ],
  389. )
  390. formatted = DataFrame(
  391. [(1, 2, 3, 4, 5, 6)],
  392. columns=[
  393. "astringwithmorethan32characters_",
  394. "_0astringwithmorethan32character",
  395. "_",
  396. "_1_",
  397. "_short",
  398. "_delete",
  399. ],
  400. )
  401. formatted.index.name = "index"
  402. formatted = formatted.astype(np.int32)
  403. with tm.ensure_clean() as path:
  404. with tm.assert_produces_warning(InvalidColumnName):
  405. original.to_stata(path, convert_dates=None, version=version)
  406. # should get a warning for that format.
  407. written_and_read_again = self.read_dta(path)
  408. expected = formatted.copy()
  409. expected.index = expected.index.astype(np.int32)
  410. tm.assert_frame_equal(written_and_read_again.set_index("index"), expected)
  411. def test_read_write_dta13(self):
  412. s1 = Series(2**9, dtype=np.int16)
  413. s2 = Series(2**17, dtype=np.int32)
  414. s3 = Series(2**33, dtype=np.int64)
  415. original = DataFrame({"int16": s1, "int32": s2, "int64": s3})
  416. original.index.name = "index"
  417. formatted = original
  418. formatted["int64"] = formatted["int64"].astype(np.float64)
  419. with tm.ensure_clean() as path:
  420. original.to_stata(path)
  421. written_and_read_again = self.read_dta(path)
  422. expected = formatted.copy()
  423. expected.index = expected.index.astype(np.int32)
  424. tm.assert_frame_equal(written_and_read_again.set_index("index"), expected)
  425. @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
  426. @pytest.mark.parametrize(
  427. "file", ["stata5_113", "stata5_114", "stata5_115", "stata5_117"]
  428. )
  429. def test_read_write_reread_dta14(self, file, parsed_114, version, datapath):
  430. file = datapath("io", "data", "stata", f"{file}.dta")
  431. parsed = self.read_dta(file)
  432. parsed.index.name = "index"
  433. tm.assert_frame_equal(parsed_114, parsed)
  434. with tm.ensure_clean() as path:
  435. parsed_114.to_stata(path, convert_dates={"date_td": "td"}, version=version)
  436. written_and_read_again = self.read_dta(path)
  437. expected = parsed_114.copy()
  438. expected.index = expected.index.astype(np.int32)
  439. tm.assert_frame_equal(written_and_read_again.set_index("index"), expected)
  440. @pytest.mark.parametrize(
  441. "file", ["stata6_113", "stata6_114", "stata6_115", "stata6_117"]
  442. )
  443. def test_read_write_reread_dta15(self, file, datapath):
  444. expected = self.read_csv(datapath("io", "data", "stata", "stata6.csv"))
  445. expected["byte_"] = expected["byte_"].astype(np.int8)
  446. expected["int_"] = expected["int_"].astype(np.int16)
  447. expected["long_"] = expected["long_"].astype(np.int32)
  448. expected["float_"] = expected["float_"].astype(np.float32)
  449. expected["double_"] = expected["double_"].astype(np.float64)
  450. expected["date_td"] = expected["date_td"].apply(
  451. datetime.strptime, args=("%Y-%m-%d",)
  452. )
  453. file = datapath("io", "data", "stata", f"{file}.dta")
  454. parsed = self.read_dta(file)
  455. tm.assert_frame_equal(expected, parsed)
  456. @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
  457. def test_timestamp_and_label(self, version):
  458. original = DataFrame([(1,)], columns=["variable"])
  459. time_stamp = datetime(2000, 2, 29, 14, 21)
  460. data_label = "This is a data file."
  461. with tm.ensure_clean() as path:
  462. original.to_stata(
  463. path, time_stamp=time_stamp, data_label=data_label, version=version
  464. )
  465. with StataReader(path) as reader:
  466. assert reader.time_stamp == "29 Feb 2000 14:21"
  467. assert reader.data_label == data_label
  468. @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
  469. def test_invalid_timestamp(self, version):
  470. original = DataFrame([(1,)], columns=["variable"])
  471. time_stamp = "01 Jan 2000, 00:00:00"
  472. with tm.ensure_clean() as path:
  473. msg = "time_stamp should be datetime type"
  474. with pytest.raises(ValueError, match=msg):
  475. original.to_stata(path, time_stamp=time_stamp, version=version)
  476. assert not os.path.isfile(path)
  477. def test_numeric_column_names(self):
  478. original = DataFrame(np.reshape(np.arange(25.0), (5, 5)))
  479. original.index.name = "index"
  480. with tm.ensure_clean() as path:
  481. # should get a warning for that format.
  482. with tm.assert_produces_warning(InvalidColumnName):
  483. original.to_stata(path)
  484. written_and_read_again = self.read_dta(path)
  485. written_and_read_again = written_and_read_again.set_index("index")
  486. columns = list(written_and_read_again.columns)
  487. convert_col_name = lambda x: int(x[1])
  488. written_and_read_again.columns = map(convert_col_name, columns)
  489. expected = original.copy()
  490. expected.index = expected.index.astype(np.int32)
  491. tm.assert_frame_equal(expected, written_and_read_again)
  492. @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
  493. def test_nan_to_missing_value(self, version):
  494. s1 = Series(np.arange(4.0), dtype=np.float32)
  495. s2 = Series(np.arange(4.0), dtype=np.float64)
  496. s1[::2] = np.nan
  497. s2[1::2] = np.nan
  498. original = DataFrame({"s1": s1, "s2": s2})
  499. original.index.name = "index"
  500. with tm.ensure_clean() as path:
  501. original.to_stata(path, version=version)
  502. written_and_read_again = self.read_dta(path)
  503. written_and_read_again = written_and_read_again.set_index("index")
  504. expected = original.copy()
  505. expected.index = expected.index.astype(np.int32)
  506. tm.assert_frame_equal(written_and_read_again, expected)
  507. def test_no_index(self):
  508. columns = ["x", "y"]
  509. original = DataFrame(np.reshape(np.arange(10.0), (5, 2)), columns=columns)
  510. original.index.name = "index_not_written"
  511. with tm.ensure_clean() as path:
  512. original.to_stata(path, write_index=False)
  513. written_and_read_again = self.read_dta(path)
  514. with pytest.raises(KeyError, match=original.index.name):
  515. written_and_read_again["index_not_written"]
  516. def test_string_no_dates(self):
  517. s1 = Series(["a", "A longer string"])
  518. s2 = Series([1.0, 2.0], dtype=np.float64)
  519. original = DataFrame({"s1": s1, "s2": s2})
  520. original.index.name = "index"
  521. with tm.ensure_clean() as path:
  522. original.to_stata(path)
  523. written_and_read_again = self.read_dta(path)
  524. expected = original.copy()
  525. expected.index = expected.index.astype(np.int32)
  526. tm.assert_frame_equal(written_and_read_again.set_index("index"), expected)
  527. def test_large_value_conversion(self):
  528. s0 = Series([1, 99], dtype=np.int8)
  529. s1 = Series([1, 127], dtype=np.int8)
  530. s2 = Series([1, 2**15 - 1], dtype=np.int16)
  531. s3 = Series([1, 2**63 - 1], dtype=np.int64)
  532. original = DataFrame({"s0": s0, "s1": s1, "s2": s2, "s3": s3})
  533. original.index.name = "index"
  534. with tm.ensure_clean() as path:
  535. with tm.assert_produces_warning(PossiblePrecisionLoss):
  536. original.to_stata(path)
  537. written_and_read_again = self.read_dta(path)
  538. modified = original.copy()
  539. modified["s1"] = Series(modified["s1"], dtype=np.int16)
  540. modified["s2"] = Series(modified["s2"], dtype=np.int32)
  541. modified["s3"] = Series(modified["s3"], dtype=np.float64)
  542. modified.index = original.index.astype(np.int32)
  543. tm.assert_frame_equal(written_and_read_again.set_index("index"), modified)
  544. def test_dates_invalid_column(self):
  545. original = DataFrame([datetime(2006, 11, 19, 23, 13, 20)])
  546. original.index.name = "index"
  547. with tm.ensure_clean() as path:
  548. with tm.assert_produces_warning(InvalidColumnName):
  549. original.to_stata(path, convert_dates={0: "tc"})
  550. written_and_read_again = self.read_dta(path)
  551. modified = original.copy()
  552. modified.columns = ["_0"]
  553. modified.index = original.index.astype(np.int32)
  554. tm.assert_frame_equal(written_and_read_again.set_index("index"), modified)
  555. def test_105(self, datapath):
  556. # Data obtained from:
  557. # http://go.worldbank.org/ZXY29PVJ21
  558. dpath = datapath("io", "data", "stata", "S4_EDUC1.dta")
  559. df = read_stata(dpath)
  560. df0 = [[1, 1, 3, -2], [2, 1, 2, -2], [4, 1, 1, -2]]
  561. df0 = DataFrame(df0)
  562. df0.columns = ["clustnum", "pri_schl", "psch_num", "psch_dis"]
  563. df0["clustnum"] = df0["clustnum"].astype(np.int16)
  564. df0["pri_schl"] = df0["pri_schl"].astype(np.int8)
  565. df0["psch_num"] = df0["psch_num"].astype(np.int8)
  566. df0["psch_dis"] = df0["psch_dis"].astype(np.float32)
  567. tm.assert_frame_equal(df.head(3), df0)
  568. def test_value_labels_old_format(self, datapath):
  569. # GH 19417
  570. #
  571. # Test that value_labels() returns an empty dict if the file format
  572. # predates supporting value labels.
  573. dpath = datapath("io", "data", "stata", "S4_EDUC1.dta")
  574. with StataReader(dpath) as reader:
  575. assert reader.value_labels() == {}
  576. def test_date_export_formats(self):
  577. columns = ["tc", "td", "tw", "tm", "tq", "th", "ty"]
  578. conversions = {c: c for c in columns}
  579. data = [datetime(2006, 11, 20, 23, 13, 20)] * len(columns)
  580. original = DataFrame([data], columns=columns)
  581. original.index.name = "index"
  582. expected_values = [
  583. datetime(2006, 11, 20, 23, 13, 20), # Time
  584. datetime(2006, 11, 20), # Day
  585. datetime(2006, 11, 19), # Week
  586. datetime(2006, 11, 1), # Month
  587. datetime(2006, 10, 1), # Quarter year
  588. datetime(2006, 7, 1), # Half year
  589. datetime(2006, 1, 1),
  590. ] # Year
  591. expected = DataFrame(
  592. [expected_values],
  593. index=pd.Index([0], dtype=np.int32, name="index"),
  594. columns=columns,
  595. )
  596. with tm.ensure_clean() as path:
  597. original.to_stata(path, convert_dates=conversions)
  598. written_and_read_again = self.read_dta(path)
  599. tm.assert_frame_equal(written_and_read_again.set_index("index"), expected)
  600. def test_write_missing_strings(self):
  601. original = DataFrame([["1"], [None]], columns=["foo"])
  602. expected = DataFrame(
  603. [["1"], [""]],
  604. index=pd.Index([0, 1], dtype=np.int32, name="index"),
  605. columns=["foo"],
  606. )
  607. with tm.ensure_clean() as path:
  608. original.to_stata(path)
  609. written_and_read_again = self.read_dta(path)
  610. tm.assert_frame_equal(written_and_read_again.set_index("index"), expected)
  611. @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
  612. @pytest.mark.parametrize("byteorder", [">", "<"])
  613. def test_bool_uint(self, byteorder, version):
  614. s0 = Series([0, 1, True], dtype=np.bool_)
  615. s1 = Series([0, 1, 100], dtype=np.uint8)
  616. s2 = Series([0, 1, 255], dtype=np.uint8)
  617. s3 = Series([0, 1, 2**15 - 100], dtype=np.uint16)
  618. s4 = Series([0, 1, 2**16 - 1], dtype=np.uint16)
  619. s5 = Series([0, 1, 2**31 - 100], dtype=np.uint32)
  620. s6 = Series([0, 1, 2**32 - 1], dtype=np.uint32)
  621. original = DataFrame(
  622. {"s0": s0, "s1": s1, "s2": s2, "s3": s3, "s4": s4, "s5": s5, "s6": s6}
  623. )
  624. original.index.name = "index"
  625. expected = original.copy()
  626. expected.index = original.index.astype(np.int32)
  627. expected_types = (
  628. np.int8,
  629. np.int8,
  630. np.int16,
  631. np.int16,
  632. np.int32,
  633. np.int32,
  634. np.float64,
  635. )
  636. for c, t in zip(expected.columns, expected_types):
  637. expected[c] = expected[c].astype(t)
  638. with tm.ensure_clean() as path:
  639. original.to_stata(path, byteorder=byteorder, version=version)
  640. written_and_read_again = self.read_dta(path)
  641. written_and_read_again = written_and_read_again.set_index("index")
  642. tm.assert_frame_equal(written_and_read_again, expected)
  643. def test_variable_labels(self, datapath):
  644. with StataReader(datapath("io", "data", "stata", "stata7_115.dta")) as rdr:
  645. sr_115 = rdr.variable_labels()
  646. with StataReader(datapath("io", "data", "stata", "stata7_117.dta")) as rdr:
  647. sr_117 = rdr.variable_labels()
  648. keys = ("var1", "var2", "var3")
  649. labels = ("label1", "label2", "label3")
  650. for k, v in sr_115.items():
  651. assert k in sr_117
  652. assert v == sr_117[k]
  653. assert k in keys
  654. assert v in labels
  655. def test_minimal_size_col(self):
  656. str_lens = (1, 100, 244)
  657. s = {}
  658. for str_len in str_lens:
  659. s["s" + str(str_len)] = Series(
  660. ["a" * str_len, "b" * str_len, "c" * str_len]
  661. )
  662. original = DataFrame(s)
  663. with tm.ensure_clean() as path:
  664. original.to_stata(path, write_index=False)
  665. with StataReader(path) as sr:
  666. sr._ensure_open() # The `_*list` variables are initialized here
  667. for variable, fmt, typ in zip(sr._varlist, sr._fmtlist, sr._typlist):
  668. assert int(variable[1:]) == int(fmt[1:-1])
  669. assert int(variable[1:]) == typ
  670. def test_excessively_long_string(self):
  671. str_lens = (1, 244, 500)
  672. s = {}
  673. for str_len in str_lens:
  674. s["s" + str(str_len)] = Series(
  675. ["a" * str_len, "b" * str_len, "c" * str_len]
  676. )
  677. original = DataFrame(s)
  678. msg = (
  679. r"Fixed width strings in Stata \.dta files are limited to 244 "
  680. r"\(or fewer\)\ncharacters\. Column 's500' does not satisfy "
  681. r"this restriction\. Use the\n'version=117' parameter to write "
  682. r"the newer \(Stata 13 and later\) format\."
  683. )
  684. with pytest.raises(ValueError, match=msg):
  685. with tm.ensure_clean() as path:
  686. original.to_stata(path)
  687. def test_missing_value_generator(self):
  688. types = ("b", "h", "l")
  689. df = DataFrame([[0.0]], columns=["float_"])
  690. with tm.ensure_clean() as path:
  691. df.to_stata(path)
  692. with StataReader(path) as rdr:
  693. valid_range = rdr.VALID_RANGE
  694. expected_values = ["." + chr(97 + i) for i in range(26)]
  695. expected_values.insert(0, ".")
  696. for t in types:
  697. offset = valid_range[t][1]
  698. for i in range(27):
  699. val = StataMissingValue(offset + 1 + i)
  700. assert val.string == expected_values[i]
  701. # Test extremes for floats
  702. val = StataMissingValue(struct.unpack("<f", b"\x00\x00\x00\x7f")[0])
  703. assert val.string == "."
  704. val = StataMissingValue(struct.unpack("<f", b"\x00\xd0\x00\x7f")[0])
  705. assert val.string == ".z"
  706. # Test extremes for floats
  707. val = StataMissingValue(
  708. struct.unpack("<d", b"\x00\x00\x00\x00\x00\x00\xe0\x7f")[0]
  709. )
  710. assert val.string == "."
  711. val = StataMissingValue(
  712. struct.unpack("<d", b"\x00\x00\x00\x00\x00\x1a\xe0\x7f")[0]
  713. )
  714. assert val.string == ".z"
  715. @pytest.mark.parametrize("file", ["stata8_113", "stata8_115", "stata8_117"])
  716. def test_missing_value_conversion(self, file, datapath):
  717. columns = ["int8_", "int16_", "int32_", "float32_", "float64_"]
  718. smv = StataMissingValue(101)
  719. keys = sorted(smv.MISSING_VALUES.keys())
  720. data = []
  721. for i in range(27):
  722. row = [StataMissingValue(keys[i + (j * 27)]) for j in range(5)]
  723. data.append(row)
  724. expected = DataFrame(data, columns=columns)
  725. parsed = read_stata(
  726. datapath("io", "data", "stata", f"{file}.dta"), convert_missing=True
  727. )
  728. tm.assert_frame_equal(parsed, expected)
  729. def test_big_dates(self, datapath):
  730. yr = [1960, 2000, 9999, 100, 2262, 1677]
  731. mo = [1, 1, 12, 1, 4, 9]
  732. dd = [1, 1, 31, 1, 22, 23]
  733. hr = [0, 0, 23, 0, 0, 0]
  734. mm = [0, 0, 59, 0, 0, 0]
  735. ss = [0, 0, 59, 0, 0, 0]
  736. expected = []
  737. for year, month, day, hour, minute, second in zip(yr, mo, dd, hr, mm, ss):
  738. row = []
  739. for j in range(7):
  740. if j == 0:
  741. row.append(datetime(year, month, day, hour, minute, second))
  742. elif j == 6:
  743. row.append(datetime(year, 1, 1))
  744. else:
  745. row.append(datetime(year, month, day))
  746. expected.append(row)
  747. expected.append([pd.NaT] * 7)
  748. columns = [
  749. "date_tc",
  750. "date_td",
  751. "date_tw",
  752. "date_tm",
  753. "date_tq",
  754. "date_th",
  755. "date_ty",
  756. ]
  757. # Fixes for weekly, quarterly,half,year
  758. expected[2][2] = datetime(9999, 12, 24)
  759. expected[2][3] = datetime(9999, 12, 1)
  760. expected[2][4] = datetime(9999, 10, 1)
  761. expected[2][5] = datetime(9999, 7, 1)
  762. expected[4][2] = datetime(2262, 4, 16)
  763. expected[4][3] = expected[4][4] = datetime(2262, 4, 1)
  764. expected[4][5] = expected[4][6] = datetime(2262, 1, 1)
  765. expected[5][2] = expected[5][3] = expected[5][4] = datetime(1677, 10, 1)
  766. expected[5][5] = expected[5][6] = datetime(1678, 1, 1)
  767. expected = DataFrame(expected, columns=columns, dtype=object)
  768. parsed_115 = read_stata(datapath("io", "data", "stata", "stata9_115.dta"))
  769. parsed_117 = read_stata(datapath("io", "data", "stata", "stata9_117.dta"))
  770. tm.assert_frame_equal(expected, parsed_115, check_datetimelike_compat=True)
  771. tm.assert_frame_equal(expected, parsed_117, check_datetimelike_compat=True)
  772. date_conversion = {c: c[-2:] for c in columns}
  773. # {c : c[-2:] for c in columns}
  774. with tm.ensure_clean() as path:
  775. expected.index.name = "index"
  776. expected.to_stata(path, convert_dates=date_conversion)
  777. written_and_read_again = self.read_dta(path)
  778. tm.assert_frame_equal(
  779. written_and_read_again.set_index("index"),
  780. expected.set_index(expected.index.astype(np.int32)),
  781. check_datetimelike_compat=True,
  782. )
  783. def test_dtype_conversion(self, datapath):
  784. expected = self.read_csv(datapath("io", "data", "stata", "stata6.csv"))
  785. expected["byte_"] = expected["byte_"].astype(np.int8)
  786. expected["int_"] = expected["int_"].astype(np.int16)
  787. expected["long_"] = expected["long_"].astype(np.int32)
  788. expected["float_"] = expected["float_"].astype(np.float32)
  789. expected["double_"] = expected["double_"].astype(np.float64)
  790. expected["date_td"] = expected["date_td"].apply(
  791. datetime.strptime, args=("%Y-%m-%d",)
  792. )
  793. no_conversion = read_stata(
  794. datapath("io", "data", "stata", "stata6_117.dta"), convert_dates=True
  795. )
  796. tm.assert_frame_equal(expected, no_conversion)
  797. conversion = read_stata(
  798. datapath("io", "data", "stata", "stata6_117.dta"),
  799. convert_dates=True,
  800. preserve_dtypes=False,
  801. )
  802. # read_csv types are the same
  803. expected = self.read_csv(datapath("io", "data", "stata", "stata6.csv"))
  804. expected["date_td"] = expected["date_td"].apply(
  805. datetime.strptime, args=("%Y-%m-%d",)
  806. )
  807. tm.assert_frame_equal(expected, conversion)
  808. def test_drop_column(self, datapath):
  809. expected = self.read_csv(datapath("io", "data", "stata", "stata6.csv"))
  810. expected["byte_"] = expected["byte_"].astype(np.int8)
  811. expected["int_"] = expected["int_"].astype(np.int16)
  812. expected["long_"] = expected["long_"].astype(np.int32)
  813. expected["float_"] = expected["float_"].astype(np.float32)
  814. expected["double_"] = expected["double_"].astype(np.float64)
  815. expected["date_td"] = expected["date_td"].apply(
  816. datetime.strptime, args=("%Y-%m-%d",)
  817. )
  818. columns = ["byte_", "int_", "long_"]
  819. expected = expected[columns]
  820. dropped = read_stata(
  821. datapath("io", "data", "stata", "stata6_117.dta"),
  822. convert_dates=True,
  823. columns=columns,
  824. )
  825. tm.assert_frame_equal(expected, dropped)
  826. # See PR 10757
  827. columns = ["int_", "long_", "byte_"]
  828. expected = expected[columns]
  829. reordered = read_stata(
  830. datapath("io", "data", "stata", "stata6_117.dta"),
  831. convert_dates=True,
  832. columns=columns,
  833. )
  834. tm.assert_frame_equal(expected, reordered)
  835. msg = "columns contains duplicate entries"
  836. with pytest.raises(ValueError, match=msg):
  837. columns = ["byte_", "byte_"]
  838. read_stata(
  839. datapath("io", "data", "stata", "stata6_117.dta"),
  840. convert_dates=True,
  841. columns=columns,
  842. )
  843. msg = "The following columns were not found in the Stata data set: not_found"
  844. with pytest.raises(ValueError, match=msg):
  845. columns = ["byte_", "int_", "long_", "not_found"]
  846. read_stata(
  847. datapath("io", "data", "stata", "stata6_117.dta"),
  848. convert_dates=True,
  849. columns=columns,
  850. )
  851. @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
  852. @pytest.mark.filterwarnings(
  853. "ignore:\\nStata value:pandas.io.stata.ValueLabelTypeMismatch"
  854. )
  855. def test_categorical_writing(self, version):
  856. original = DataFrame.from_records(
  857. [
  858. ["one", "ten", "one", "one", "one", 1],
  859. ["two", "nine", "two", "two", "two", 2],
  860. ["three", "eight", "three", "three", "three", 3],
  861. ["four", "seven", 4, "four", "four", 4],
  862. ["five", "six", 5, np.nan, "five", 5],
  863. ["six", "five", 6, np.nan, "six", 6],
  864. ["seven", "four", 7, np.nan, "seven", 7],
  865. ["eight", "three", 8, np.nan, "eight", 8],
  866. ["nine", "two", 9, np.nan, "nine", 9],
  867. ["ten", "one", "ten", np.nan, "ten", 10],
  868. ],
  869. columns=[
  870. "fully_labeled",
  871. "fully_labeled2",
  872. "incompletely_labeled",
  873. "labeled_with_missings",
  874. "float_labelled",
  875. "unlabeled",
  876. ],
  877. )
  878. expected = original.copy()
  879. # these are all categoricals
  880. original = pd.concat(
  881. [original[col].astype("category") for col in original], axis=1
  882. )
  883. expected.index = expected.index.set_names("index").astype(np.int32)
  884. expected["incompletely_labeled"] = expected["incompletely_labeled"].apply(str)
  885. expected["unlabeled"] = expected["unlabeled"].apply(str)
  886. for col in expected:
  887. orig = expected[col].copy()
  888. cat = orig.astype("category")._values
  889. cat = cat.as_ordered()
  890. if col == "unlabeled":
  891. cat = cat.set_categories(orig, ordered=True)
  892. cat.categories.rename(None, inplace=True)
  893. expected[col] = cat
  894. with tm.ensure_clean() as path:
  895. original.to_stata(path, version=version)
  896. written_and_read_again = self.read_dta(path)
  897. res = written_and_read_again.set_index("index")
  898. tm.assert_frame_equal(res, expected)
  899. def test_categorical_warnings_and_errors(self):
  900. # Warning for non-string labels
  901. # Error for labels too long
  902. original = DataFrame.from_records(
  903. [["a" * 10000], ["b" * 10000], ["c" * 10000], ["d" * 10000]],
  904. columns=["Too_long"],
  905. )
  906. original = pd.concat(
  907. [original[col].astype("category") for col in original], axis=1
  908. )
  909. with tm.ensure_clean() as path:
  910. msg = (
  911. "Stata value labels for a single variable must have "
  912. r"a combined length less than 32,000 characters\."
  913. )
  914. with pytest.raises(ValueError, match=msg):
  915. original.to_stata(path)
  916. original = DataFrame.from_records(
  917. [["a"], ["b"], ["c"], ["d"], [1]], columns=["Too_long"]
  918. )
  919. original = pd.concat(
  920. [original[col].astype("category") for col in original], axis=1
  921. )
  922. with tm.assert_produces_warning(ValueLabelTypeMismatch):
  923. original.to_stata(path)
  924. # should get a warning for mixed content
  925. @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
  926. def test_categorical_with_stata_missing_values(self, version):
  927. values = [["a" + str(i)] for i in range(120)]
  928. values.append([np.nan])
  929. original = DataFrame.from_records(values, columns=["many_labels"])
  930. original = pd.concat(
  931. [original[col].astype("category") for col in original], axis=1
  932. )
  933. original.index.name = "index"
  934. with tm.ensure_clean() as path:
  935. original.to_stata(path, version=version)
  936. written_and_read_again = self.read_dta(path)
  937. res = written_and_read_again.set_index("index")
  938. expected = original.copy()
  939. for col in expected:
  940. cat = expected[col]._values
  941. new_cats = cat.remove_unused_categories().categories
  942. cat = cat.set_categories(new_cats, ordered=True)
  943. expected[col] = cat
  944. expected.index = expected.index.astype(np.int32)
  945. tm.assert_frame_equal(res, expected)
  946. @pytest.mark.parametrize("file", ["stata10_115", "stata10_117"])
  947. def test_categorical_order(self, file, datapath):
  948. # Directly construct using expected codes
  949. # Format is is_cat, col_name, labels (in order), underlying data
  950. expected = [
  951. (True, "ordered", ["a", "b", "c", "d", "e"], np.arange(5)),
  952. (True, "reverse", ["a", "b", "c", "d", "e"], np.arange(5)[::-1]),
  953. (True, "noorder", ["a", "b", "c", "d", "e"], np.array([2, 1, 4, 0, 3])),
  954. (True, "floating", ["a", "b", "c", "d", "e"], np.arange(0, 5)),
  955. (True, "float_missing", ["a", "d", "e"], np.array([0, 1, 2, -1, -1])),
  956. (False, "nolabel", [1.0, 2.0, 3.0, 4.0, 5.0], np.arange(5)),
  957. (True, "int32_mixed", ["d", 2, "e", "b", "a"], np.arange(5)),
  958. ]
  959. cols = []
  960. for is_cat, col, labels, codes in expected:
  961. if is_cat:
  962. cols.append(
  963. (col, pd.Categorical.from_codes(codes, labels, ordered=True))
  964. )
  965. else:
  966. cols.append((col, Series(labels, dtype=np.float32)))
  967. expected = DataFrame.from_dict(dict(cols))
  968. # Read with and with out categoricals, ensure order is identical
  969. file = datapath("io", "data", "stata", f"{file}.dta")
  970. parsed = read_stata(file)
  971. tm.assert_frame_equal(expected, parsed)
  972. # Check identity of codes
  973. for col in expected:
  974. if isinstance(expected[col].dtype, CategoricalDtype):
  975. tm.assert_series_equal(expected[col].cat.codes, parsed[col].cat.codes)
  976. tm.assert_index_equal(
  977. expected[col].cat.categories, parsed[col].cat.categories
  978. )
  979. @pytest.mark.parametrize("file", ["stata11_115", "stata11_117"])
  980. def test_categorical_sorting(self, file, datapath):
  981. parsed = read_stata(datapath("io", "data", "stata", f"{file}.dta"))
  982. # Sort based on codes, not strings
  983. parsed = parsed.sort_values("srh", na_position="first")
  984. # Don't sort index
  985. parsed.index = pd.RangeIndex(len(parsed))
  986. codes = [-1, -1, 0, 1, 1, 1, 2, 2, 3, 4]
  987. categories = ["Poor", "Fair", "Good", "Very good", "Excellent"]
  988. cat = pd.Categorical.from_codes(
  989. codes=codes, categories=categories, ordered=True
  990. )
  991. expected = Series(cat, name="srh")
  992. tm.assert_series_equal(expected, parsed["srh"])
  993. @pytest.mark.parametrize("file", ["stata10_115", "stata10_117"])
  994. def test_categorical_ordering(self, file, datapath):
  995. file = datapath("io", "data", "stata", f"{file}.dta")
  996. parsed = read_stata(file)
  997. parsed_unordered = read_stata(file, order_categoricals=False)
  998. for col in parsed:
  999. if not isinstance(parsed[col].dtype, CategoricalDtype):
  1000. continue
  1001. assert parsed[col].cat.ordered
  1002. assert not parsed_unordered[col].cat.ordered
  1003. @pytest.mark.filterwarnings("ignore::UserWarning")
  1004. @pytest.mark.parametrize(
  1005. "file",
  1006. [
  1007. "stata1_117",
  1008. "stata2_117",
  1009. "stata3_117",
  1010. "stata4_117",
  1011. "stata5_117",
  1012. "stata6_117",
  1013. "stata7_117",
  1014. "stata8_117",
  1015. "stata9_117",
  1016. "stata10_117",
  1017. "stata11_117",
  1018. ],
  1019. )
  1020. @pytest.mark.parametrize("chunksize", [1, 2])
  1021. @pytest.mark.parametrize("convert_categoricals", [False, True])
  1022. @pytest.mark.parametrize("convert_dates", [False, True])
  1023. def test_read_chunks_117(
  1024. self, file, chunksize, convert_categoricals, convert_dates, datapath
  1025. ):
  1026. fname = datapath("io", "data", "stata", f"{file}.dta")
  1027. parsed = read_stata(
  1028. fname,
  1029. convert_categoricals=convert_categoricals,
  1030. convert_dates=convert_dates,
  1031. )
  1032. with read_stata(
  1033. fname,
  1034. iterator=True,
  1035. convert_categoricals=convert_categoricals,
  1036. convert_dates=convert_dates,
  1037. ) as itr:
  1038. pos = 0
  1039. for j in range(5):
  1040. try:
  1041. chunk = itr.read(chunksize)
  1042. except StopIteration:
  1043. break
  1044. from_frame = parsed.iloc[pos : pos + chunksize, :].copy()
  1045. from_frame = self._convert_categorical(from_frame)
  1046. tm.assert_frame_equal(
  1047. from_frame, chunk, check_dtype=False, check_datetimelike_compat=True
  1048. )
  1049. pos += chunksize
  1050. @staticmethod
  1051. def _convert_categorical(from_frame: DataFrame) -> DataFrame:
  1052. """
  1053. Emulate the categorical casting behavior we expect from roundtripping.
  1054. """
  1055. for col in from_frame:
  1056. ser = from_frame[col]
  1057. if isinstance(ser.dtype, CategoricalDtype):
  1058. cat = ser._values.remove_unused_categories()
  1059. if cat.categories.dtype == object:
  1060. categories = pd.Index._with_infer(cat.categories._values)
  1061. cat = cat.set_categories(categories)
  1062. elif cat.categories.dtype == "string" and len(cat.categories) == 0:
  1063. # if the read categories are empty, it comes back as object dtype
  1064. categories = cat.categories.astype(object)
  1065. cat = cat.set_categories(categories)
  1066. from_frame[col] = cat
  1067. return from_frame
  1068. def test_iterator(self, datapath):
  1069. fname = datapath("io", "data", "stata", "stata3_117.dta")
  1070. parsed = read_stata(fname)
  1071. with read_stata(fname, iterator=True) as itr:
  1072. chunk = itr.read(5)
  1073. tm.assert_frame_equal(parsed.iloc[0:5, :], chunk)
  1074. with read_stata(fname, chunksize=5) as itr:
  1075. chunk = list(itr)
  1076. tm.assert_frame_equal(parsed.iloc[0:5, :], chunk[0])
  1077. with read_stata(fname, iterator=True) as itr:
  1078. chunk = itr.get_chunk(5)
  1079. tm.assert_frame_equal(parsed.iloc[0:5, :], chunk)
  1080. with read_stata(fname, chunksize=5) as itr:
  1081. chunk = itr.get_chunk()
  1082. tm.assert_frame_equal(parsed.iloc[0:5, :], chunk)
  1083. # GH12153
  1084. with read_stata(fname, chunksize=4) as itr:
  1085. from_chunks = pd.concat(itr)
  1086. tm.assert_frame_equal(parsed, from_chunks)
  1087. @pytest.mark.filterwarnings("ignore::UserWarning")
  1088. @pytest.mark.parametrize(
  1089. "file",
  1090. [
  1091. "stata2_115",
  1092. "stata3_115",
  1093. "stata4_115",
  1094. "stata5_115",
  1095. "stata6_115",
  1096. "stata7_115",
  1097. "stata8_115",
  1098. "stata9_115",
  1099. "stata10_115",
  1100. "stata11_115",
  1101. ],
  1102. )
  1103. @pytest.mark.parametrize("chunksize", [1, 2])
  1104. @pytest.mark.parametrize("convert_categoricals", [False, True])
  1105. @pytest.mark.parametrize("convert_dates", [False, True])
  1106. def test_read_chunks_115(
  1107. self, file, chunksize, convert_categoricals, convert_dates, datapath
  1108. ):
  1109. fname = datapath("io", "data", "stata", f"{file}.dta")
  1110. # Read the whole file
  1111. parsed = read_stata(
  1112. fname,
  1113. convert_categoricals=convert_categoricals,
  1114. convert_dates=convert_dates,
  1115. )
  1116. # Compare to what we get when reading by chunk
  1117. with read_stata(
  1118. fname,
  1119. iterator=True,
  1120. convert_dates=convert_dates,
  1121. convert_categoricals=convert_categoricals,
  1122. ) as itr:
  1123. pos = 0
  1124. for j in range(5):
  1125. try:
  1126. chunk = itr.read(chunksize)
  1127. except StopIteration:
  1128. break
  1129. from_frame = parsed.iloc[pos : pos + chunksize, :].copy()
  1130. from_frame = self._convert_categorical(from_frame)
  1131. tm.assert_frame_equal(
  1132. from_frame, chunk, check_dtype=False, check_datetimelike_compat=True
  1133. )
  1134. pos += chunksize
  1135. def test_read_chunks_columns(self, datapath):
  1136. fname = datapath("io", "data", "stata", "stata3_117.dta")
  1137. columns = ["quarter", "cpi", "m1"]
  1138. chunksize = 2
  1139. parsed = read_stata(fname, columns=columns)
  1140. with read_stata(fname, iterator=True) as itr:
  1141. pos = 0
  1142. for j in range(5):
  1143. chunk = itr.read(chunksize, columns=columns)
  1144. if chunk is None:
  1145. break
  1146. from_frame = parsed.iloc[pos : pos + chunksize, :]
  1147. tm.assert_frame_equal(from_frame, chunk, check_dtype=False)
  1148. pos += chunksize
  1149. @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
  1150. def test_write_variable_labels(self, version, mixed_frame):
  1151. # GH 13631, add support for writing variable labels
  1152. mixed_frame.index.name = "index"
  1153. variable_labels = {"a": "City Rank", "b": "City Exponent", "c": "City"}
  1154. with tm.ensure_clean() as path:
  1155. mixed_frame.to_stata(path, variable_labels=variable_labels, version=version)
  1156. with StataReader(path) as sr:
  1157. read_labels = sr.variable_labels()
  1158. expected_labels = {
  1159. "index": "",
  1160. "a": "City Rank",
  1161. "b": "City Exponent",
  1162. "c": "City",
  1163. }
  1164. assert read_labels == expected_labels
  1165. variable_labels["index"] = "The Index"
  1166. with tm.ensure_clean() as path:
  1167. mixed_frame.to_stata(path, variable_labels=variable_labels, version=version)
  1168. with StataReader(path) as sr:
  1169. read_labels = sr.variable_labels()
  1170. assert read_labels == variable_labels
  1171. @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
  1172. def test_invalid_variable_labels(self, version, mixed_frame):
  1173. mixed_frame.index.name = "index"
  1174. variable_labels = {"a": "very long" * 10, "b": "City Exponent", "c": "City"}
  1175. with tm.ensure_clean() as path:
  1176. msg = "Variable labels must be 80 characters or fewer"
  1177. with pytest.raises(ValueError, match=msg):
  1178. mixed_frame.to_stata(
  1179. path, variable_labels=variable_labels, version=version
  1180. )
  1181. @pytest.mark.parametrize("version", [114, 117])
  1182. def test_invalid_variable_label_encoding(self, version, mixed_frame):
  1183. mixed_frame.index.name = "index"
  1184. variable_labels = {"a": "very long" * 10, "b": "City Exponent", "c": "City"}
  1185. variable_labels["a"] = "invalid character Œ"
  1186. with tm.ensure_clean() as path:
  1187. with pytest.raises(
  1188. ValueError, match="Variable labels must contain only characters"
  1189. ):
  1190. mixed_frame.to_stata(
  1191. path, variable_labels=variable_labels, version=version
  1192. )
  1193. def test_write_variable_label_errors(self, mixed_frame):
  1194. values = ["\u03A1", "\u0391", "\u039D", "\u0394", "\u0391", "\u03A3"]
  1195. variable_labels_utf8 = {
  1196. "a": "City Rank",
  1197. "b": "City Exponent",
  1198. "c": "".join(values),
  1199. }
  1200. msg = (
  1201. "Variable labels must contain only characters that can be "
  1202. "encoded in Latin-1"
  1203. )
  1204. with pytest.raises(ValueError, match=msg):
  1205. with tm.ensure_clean() as path:
  1206. mixed_frame.to_stata(path, variable_labels=variable_labels_utf8)
  1207. variable_labels_long = {
  1208. "a": "City Rank",
  1209. "b": "City Exponent",
  1210. "c": "A very, very, very long variable label "
  1211. "that is too long for Stata which means "
  1212. "that it has more than 80 characters",
  1213. }
  1214. msg = "Variable labels must be 80 characters or fewer"
  1215. with pytest.raises(ValueError, match=msg):
  1216. with tm.ensure_clean() as path:
  1217. mixed_frame.to_stata(path, variable_labels=variable_labels_long)
  1218. def test_default_date_conversion(self):
  1219. # GH 12259
  1220. dates = [
  1221. dt.datetime(1999, 12, 31, 12, 12, 12, 12000),
  1222. dt.datetime(2012, 12, 21, 12, 21, 12, 21000),
  1223. dt.datetime(1776, 7, 4, 7, 4, 7, 4000),
  1224. ]
  1225. original = DataFrame(
  1226. {
  1227. "nums": [1.0, 2.0, 3.0],
  1228. "strs": ["apple", "banana", "cherry"],
  1229. "dates": dates,
  1230. }
  1231. )
  1232. with tm.ensure_clean() as path:
  1233. original.to_stata(path, write_index=False)
  1234. reread = read_stata(path, convert_dates=True)
  1235. tm.assert_frame_equal(original, reread)
  1236. original.to_stata(path, write_index=False, convert_dates={"dates": "tc"})
  1237. direct = read_stata(path, convert_dates=True)
  1238. tm.assert_frame_equal(reread, direct)
  1239. dates_idx = original.columns.tolist().index("dates")
  1240. original.to_stata(path, write_index=False, convert_dates={dates_idx: "tc"})
  1241. direct = read_stata(path, convert_dates=True)
  1242. tm.assert_frame_equal(reread, direct)
  1243. def test_unsupported_type(self):
  1244. original = DataFrame({"a": [1 + 2j, 2 + 4j]})
  1245. msg = "Data type complex128 not supported"
  1246. with pytest.raises(NotImplementedError, match=msg):
  1247. with tm.ensure_clean() as path:
  1248. original.to_stata(path)
  1249. def test_unsupported_datetype(self):
  1250. dates = [
  1251. dt.datetime(1999, 12, 31, 12, 12, 12, 12000),
  1252. dt.datetime(2012, 12, 21, 12, 21, 12, 21000),
  1253. dt.datetime(1776, 7, 4, 7, 4, 7, 4000),
  1254. ]
  1255. original = DataFrame(
  1256. {
  1257. "nums": [1.0, 2.0, 3.0],
  1258. "strs": ["apple", "banana", "cherry"],
  1259. "dates": dates,
  1260. }
  1261. )
  1262. msg = "Format %tC not implemented"
  1263. with pytest.raises(NotImplementedError, match=msg):
  1264. with tm.ensure_clean() as path:
  1265. original.to_stata(path, convert_dates={"dates": "tC"})
  1266. dates = pd.date_range("1-1-1990", periods=3, tz="Asia/Hong_Kong")
  1267. original = DataFrame(
  1268. {
  1269. "nums": [1.0, 2.0, 3.0],
  1270. "strs": ["apple", "banana", "cherry"],
  1271. "dates": dates,
  1272. }
  1273. )
  1274. with pytest.raises(NotImplementedError, match="Data type datetime64"):
  1275. with tm.ensure_clean() as path:
  1276. original.to_stata(path)
  1277. def test_repeated_column_labels(self, datapath):
  1278. # GH 13923, 25772
  1279. msg = """
  1280. Value labels for column ethnicsn are not unique. These cannot be converted to
  1281. pandas categoricals.
  1282. Either read the file with `convert_categoricals` set to False or use the
  1283. low level interface in `StataReader` to separately read the values and the
  1284. value_labels.
  1285. The repeated labels are:\n-+\nwolof
  1286. """
  1287. with pytest.raises(ValueError, match=msg):
  1288. read_stata(
  1289. datapath("io", "data", "stata", "stata15.dta"),
  1290. convert_categoricals=True,
  1291. )
  1292. def test_stata_111(self, datapath):
  1293. # 111 is an old version but still used by current versions of
  1294. # SAS when exporting to Stata format. We do not know of any
  1295. # on-line documentation for this version.
  1296. df = read_stata(datapath("io", "data", "stata", "stata7_111.dta"))
  1297. original = DataFrame(
  1298. {
  1299. "y": [1, 1, 1, 1, 1, 0, 0, np.nan, 0, 0],
  1300. "x": [1, 2, 1, 3, np.nan, 4, 3, 5, 1, 6],
  1301. "w": [2, np.nan, 5, 2, 4, 4, 3, 1, 2, 3],
  1302. "z": ["a", "b", "c", "d", "e", "", "g", "h", "i", "j"],
  1303. }
  1304. )
  1305. original = original[["y", "x", "w", "z"]]
  1306. tm.assert_frame_equal(original, df)
  1307. def test_out_of_range_double(self):
  1308. # GH 14618
  1309. df = DataFrame(
  1310. {
  1311. "ColumnOk": [0.0, np.finfo(np.double).eps, 4.49423283715579e307],
  1312. "ColumnTooBig": [0.0, np.finfo(np.double).eps, np.finfo(np.double).max],
  1313. }
  1314. )
  1315. msg = (
  1316. r"Column ColumnTooBig has a maximum value \(.+\) outside the range "
  1317. r"supported by Stata \(.+\)"
  1318. )
  1319. with pytest.raises(ValueError, match=msg):
  1320. with tm.ensure_clean() as path:
  1321. df.to_stata(path)
  def test_out_of_range_float(self):
      """Round-trip float32 columns; the one holding float32 max is compared as float64."""
      f32 = np.finfo(np.float32)
      frame = DataFrame(
          {
              "ColumnOk": [0.0, f32.eps, f32.max / 10.0],
              "ColumnTooBig": [0.0, f32.eps, f32.max],
          }
      )
      frame.index.name = "index"
      for name in frame.columns:
          frame[name] = frame[name].astype(np.float32)

      with tm.ensure_clean() as path:
          frame.to_stata(path)
          roundtripped = read_stata(path)

      # The expectation uses float64 for the too-big column and an int32 index.
      frame["ColumnTooBig"] = frame["ColumnTooBig"].astype(np.float64)
      expected = frame.copy()
      expected.index = expected.index.astype(np.int32)
      tm.assert_frame_equal(roundtripped.set_index("index"), expected)
  @pytest.mark.parametrize("infval", [np.inf, -np.inf])
  def test_inf(self, infval):
      """Writing a column that contains +inf or -inf raises ValueError."""
      # GH 45350
      frame = DataFrame({"WithoutInf": [0.0, 1.0], "WithInf": [2.0, infval]})
      # NOTE(review): the two fragments join without a space
      # ("-infinitywhich"); presumably this mirrors the writer's message
      # verbatim -- confirm against the library before changing it here.
      pattern = (
          "Column WithInf contains infinity or -infinity"
          "which is outside the range supported by Stata."
      )
      with pytest.raises(ValueError, match=pattern), tm.ensure_clean() as path:
          frame.to_stata(path)
  def test_path_pathlib(self):
      """Round-trip a frame through ``tm.round_trip_pathlib``."""
      row_labels = [f"i-{i}" for i in range(30)]
      frame = DataFrame(
          1.1 * np.arange(120).reshape((30, 4)),
          columns=pd.Index(list("ABCD")),
          index=pd.Index(row_labels),
      )
      frame.index.name = "index"

      def read_indexed(target):
          # Restore the written index column as the frame index.
          return read_stata(target).set_index("index")

      roundtripped = tm.round_trip_pathlib(frame.to_stata, read_indexed)
      tm.assert_frame_equal(frame, roundtripped)
  def test_pickle_path_localpath(self):
      """Round-trip a frame through ``tm.round_trip_localpath``."""
      row_labels = [f"i-{i}" for i in range(30)]
      frame = DataFrame(
          1.1 * np.arange(120).reshape((30, 4)),
          columns=pd.Index(list("ABCD")),
          index=pd.Index(row_labels),
      )
      frame.index.name = "index"

      def read_indexed(target):
          # Restore the written index column as the frame index.
          return read_stata(target).set_index("index")

      roundtripped = tm.round_trip_localpath(frame.to_stata, read_indexed)
      tm.assert_frame_equal(frame, roundtripped)
  @pytest.mark.parametrize("write_index", [True, False])
  def test_value_labels_iterator(self, write_index):
      """Value labels of a categorical column can be read from the iterator."""
      # GH 16923
      frame = DataFrame(data={"A": ["B", "E", "C", "A", "E"]})
      frame["A"] = frame["A"].astype("category")

      with tm.ensure_clean() as path:
          frame.to_stata(path, write_index=write_index)
          with read_stata(path, iterator=True) as dta_iter:
              labels = dta_iter.value_labels()

      assert labels == {"A": {0: "A", 1: "B", 2: "C", 3: "E"}}
  def test_set_index(self):
      """``index_col`` restores the written index when reading back."""
      # GH 17328
      row_labels = [f"i-{i}" for i in range(30)]
      frame = DataFrame(
          1.1 * np.arange(120).reshape((30, 4)),
          columns=pd.Index(list("ABCD")),
          index=pd.Index(row_labels),
      )
      frame.index.name = "index"

      with tm.ensure_clean() as path:
          frame.to_stata(path)
          roundtripped = read_stata(path, index_col="index")

      tm.assert_frame_equal(frame, roundtripped)
  @pytest.mark.parametrize(
      "column", ["ms", "day", "week", "month", "qtr", "half", "yr"]
  )
  def test_date_parsing_ignores_format_details(self, column, datapath):
      """A date column parses the same with or without display-format details."""
      # GH 17797
      #
      # Display formats must be ignored when deciding whether a numeric
      # column holds date values.
      #
      # Every date type is stored as a number; the format attached to the
      # column carries both the kind of date and how it is displayed.
      #
      # STATA supports 9 date types, each with its own unit. 7 of the 9 are
      # covered here; %tC and %tb are left out. %tC is a variant of %tc that
      # accounts for leap seconds and %tb relies on STATAs business calendar.
      frame = read_stata(datapath("io", "data", "stata", "stata13_dates.dta"))

      plain_value = frame.loc[0, column]
      formatted_value = frame.loc[0, f"{column}_fmt"]
      assert plain_value == formatted_value
  def test_writer_117(self, using_infer_string):
      """Round-trip mixed dtypes, long strings and a forced strL via format 117."""
      column_names = [
          "string",
          "object",
          "int8",
          "int16",
          "int32",
          "float32",
          "float64",
          "datetime",
          "s1",
          "s2045",
          "srtl",
          "forced_strl",
      ]
      first_row = [
          "string",
          "object",
          1,
          1,
          1,
          1.1,
          1.1,
          np.datetime64("2003-12-25"),
          "a",
          "a" * 2045,
          "a" * 5000,
          "a",
      ]
      second_row = [
          "string-1",
          "object-1",
          1,
          1,
          1,
          1.1,
          1.1,
          np.datetime64("2003-12-26"),
          "b",
          "b" * 2045,
          "",
          "",
      ]
      original = DataFrame(data=[first_row, second_row], columns=column_names)

      # Give the columns the dtypes their names advertise.
      series_dtypes = (
          ("object", object),
          ("int8", np.int8),
          ("int16", np.int16),
          ("float32", np.float32),
      )
      for name, dtype in series_dtypes:
          original[name] = Series(original[name], dtype=dtype)
      original["int32"] = original["int32"].astype(np.int32)

      original.index.name = "index"
      original.index = original.index.astype(np.int32)
      snapshot = original.copy()

      with tm.ensure_clean() as path:
          original.to_stata(
              path,
              convert_dates={"datetime": "tc"},
              convert_strl=["forced_strl"],
              version=117,
          )
          roundtripped = self.read_dta(path)

      expected = original[:]
      if using_infer_string:
          # object dtype (with only strings/None) comes back as string dtype
          expected["object"] = expected["object"].astype("str")

      tm.assert_frame_equal(roundtripped.set_index("index"), expected)
      # The frame handed to the writer must still equal its pre-write copy.
      tm.assert_frame_equal(original, snapshot)
  def test_convert_strl_name_swap(self):
      """``convert_strl`` works for columns whose names need conversion."""
      frame = DataFrame(
          [["a" * 3000, "A", "apple"], ["b" * 1000, "B", "banana"]],
          columns=["long1" * 10, "long", 1],
      )
      frame.index.name = "index"

      with tm.assert_produces_warning(InvalidColumnName), tm.ensure_clean() as path:
          frame.to_stata(path, convert_strl=["long", 1], version=117)
          roundtripped = self.read_dta(path).set_index("index")
          # Compare the data under the original column labels.
          roundtripped.columns = frame.columns
          tm.assert_frame_equal(roundtripped, frame, check_index_type=False)
  def test_invalid_date_conversion(self):
      """An unknown column name in ``convert_dates`` raises ValueError."""
      # GH 12259
      timestamps = [
          dt.datetime(1999, 12, 31, 12, 12, 12, 12000),
          dt.datetime(2012, 12, 21, 12, 21, 12, 21000),
          dt.datetime(1776, 7, 4, 7, 4, 7, 4000),
      ]
      frame = DataFrame(
          {
              "nums": [1.0, 2.0, 3.0],
              "strs": ["apple", "banana", "cherry"],
              "dates": timestamps,
          }
      )

      pattern = "convert_dates key must be a column or an integer"
      with tm.ensure_clean() as path, pytest.raises(ValueError, match=pattern):
          frame.to_stata(path, convert_dates={"wrong_name": "tc"})
  @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
  def test_nonfile_writing(self, version):
      """``to_stata`` can write to an in-memory buffer instead of a path."""
      # GH 21041
      buffer = io.BytesIO()
      row_labels = [f"i-{i}" for i in range(30)]
      frame = DataFrame(
          1.1 * np.arange(120).reshape((30, 4)),
          columns=pd.Index(list("ABCD")),
          index=pd.Index(row_labels),
      )
      frame.index.name = "index"

      with tm.ensure_clean() as path:
          frame.to_stata(buffer, version=version)
          # Dump the buffer to disk and read it back like a regular file.
          buffer.seek(0)
          with open(path, "wb") as handle:
              handle.write(buffer.read())
          roundtripped = read_stata(path, index_col="index")

      tm.assert_frame_equal(frame, roundtripped)
  def test_gzip_writing(self):
      """Format 114 can be written to and read from a ``GzipFile`` handle."""
      # writing version 117 requires seek and cannot be used with gzip
      row_labels = [f"i-{i}" for i in range(30)]
      frame = DataFrame(
          1.1 * np.arange(120).reshape((30, 4)),
          columns=pd.Index(list("ABCD")),
          index=pd.Index(row_labels),
      )
      frame.index.name = "index"

      with tm.ensure_clean() as path:
          with gzip.GzipFile(path, "wb") as sink:
              frame.to_stata(sink, version=114)
          with gzip.GzipFile(path, "rb") as source:
              roundtripped = read_stata(source, index_col="index")

      tm.assert_frame_equal(frame, roundtripped)
  def test_unicode_dta_118(self, datapath):
      """Read a format-118 fixture with non-ASCII text and compare it to known values."""
      unicode_df = self.read_dta(datapath("io", "data", "stata", "stata16_118.dta"))

      columns = ["utf8", "latin1", "ascii", "utf8_strl", "ascii_strl"]
      # One inner list per expected row, in the column order above.
      # NOTE(review): this block was recovered from a whitespace-collapsed
      # copy; the whitespace-only literals below may originally have held
      # longer runs of spaces -- confirm against the fixture file.
      values = [
          ["ραηδας", "PÄNDÄS", "p", "ραηδας", "p"],
          ["ƤĀńĐąŜ", "Ö", "a", "ƤĀńĐąŜ", "a"],
          ["ᴘᴀᴎᴅᴀS", "Ü", "n", "ᴘᴀᴎᴅᴀS", "n"],
          [" ", " ", "d", " ", "d"],
          [" ", "", "a", " ", "a"],
          ["", "", "s", "", "s"],
          ["", "", " ", "", " "],
      ]
      expected = DataFrame(values, columns=columns)

      tm.assert_frame_equal(unicode_df, expected)
  def test_mixed_string_strl(self, using_infer_string):
      """A long-string column containing None, or only None, reads back as ''."""
      # GH 23633
      records = [
          {"mixed": "string" * 500, "number": 0},
          {"mixed": None, "number": 1},
      ]
      frame = DataFrame(records)
      frame["number"] = frame["number"].astype("int32")

      with tm.ensure_clean() as path:
          frame.to_stata(path, write_index=False, version=117)
          roundtripped = read_stata(path)
          tm.assert_frame_equal(roundtripped, frame.fillna(""))

          # Check strl supports all None (null)
          frame["mixed"] = None
          frame.to_stata(
              path, write_index=False, convert_strl=["mixed"], version=117
          )
          roundtripped = read_stata(path)

          expected = frame.copy()
          if using_infer_string:
              expected["mixed"] = expected["mixed"].astype("str")
          expected = expected.fillna("")
          tm.assert_frame_equal(roundtripped, expected)
  @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
  def test_all_none_exception(self, version):
      """A column consisting only of None cannot be exported."""
      records = [{"none": "none", "number": 0}, {"none": None, "number": 1}]
      frame = DataFrame(records)
      frame["none"] = None

      pattern = "Column `none` cannot be exported"
      with tm.ensure_clean() as path, pytest.raises(ValueError, match=pattern):
          frame.to_stata(path, version=version)
  @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
  def test_invalid_file_not_written(self, version):
      """Text that cannot be encoded makes ``to_stata`` raise UnicodeEncodeError."""
      text = "Here is one __�__ Another one __·__ Another one __½__"
      frame = DataFrame([text], columns=["invalid"])

      # Either of these two codec messages is accepted.
      latin1_pattern = (
          r"'latin-1' codec can't encode character '\\ufffd' "
          r"in position 14: ordinal not in range\(256\)"
      )
      ascii_pattern = (
          "'ascii' codec can't decode byte 0xef in position 14: "
          r"ordinal not in range\(128\)"
      )
      pattern = "|".join((latin1_pattern, ascii_pattern))

      with tm.ensure_clean() as path:
          with pytest.raises(UnicodeEncodeError, match=pattern):
              frame.to_stata(path)
  def test_strl_latin1(self):
      """Check the strL (GSO) records of a format-117 file holding Latin-1 text.

      The same non-ASCII text is written once as a regular string column and
      once as a forced strL column. The raw file must contain both the
      Latin-1 and the UTF-8 encoding of that text, and every GSO record's
      size byte (the byte following ``0x82``) must equal the payload length
      plus one.
      """
      # GH 23573, correct GSO data to reflect correct size
      # Every character here must be encodable as Latin-1, because the
      # assertions below call ``.encode("latin-1")``. The previous literal
      # contained U+0167, which Latin-1 cannot represent, so the test raised
      # UnicodeEncodeError before checking anything. A single variable keeps
      # the written data and the expectation in sync.
      text = "þâÑÐÅ§"
      output = DataFrame(
          [["pandas"] * 2, [text] * 2], columns=["var_str", "var_strl"]
      )

      with tm.ensure_clean() as path:
          output.to_stata(path, version=117, convert_strl=["var_strl"])
          with open(path, "rb") as reread:
              content = reread.read()

          assert text.encode("latin-1") in content
          assert text.encode("utf-8") in content

          # Take what follows the first b"strls" marker, trimmed by one
          # leading and two trailing bytes, then inspect each GSO record.
          gsos = content.split(b"strls")[1][1:-2]
          for gso in gsos.split(b"GSO")[1:]:
              val = gso.split(b"\x00")[-2]
              size = gso[gso.find(b"\x82") + 1]
              assert len(val) == size - 1
  def test_encoding_latin1_118(self, datapath):
      """Reading a fixture that is not valid UTF-8 warns once and still decodes."""
      # GH 25960
      # NOTE(review): ``msg`` is compared by equality below. This block was
      # recovered from a whitespace-collapsed copy, so spacing inside the
      # literal may differ from the real warning text -- confirm.
      msg = """
One or more strings in the dta file could not be decoded using utf-8, and
so the fallback encoding of latin-1 is being used. This can happen when a file
has been incorrectly encoded by Stata or some other software. You should verify
the string values returned are correct."""
      # Move path outside of read_stata, or else assert_produces_warning
      # will block pytests skip mechanism from triggering (failing the test)
      # if the path is not present
      path = datapath("io", "data", "stata", "stata1_encoding_118.dta")
      with tm.assert_produces_warning(UnicodeWarning, filter_level="once") as w:
          encoded = read_stata(path)
          # with filter_level="always", produces 151 warnings which can be slow
          assert len(w) == 1
          assert w[0].message.args[0] == msg

      # All 151 rows are expected to hold the same decoded value.
      expected = DataFrame([["Düsseldorf"]] * 151, columns=["kreis1849"])
      tm.assert_frame_equal(encoded, expected)
  @pytest.mark.slow
  def test_stata_119(self, datapath):
      """A gzipped format-119 fixture reports the expected variable count."""
      # The fixture is gzipped: it holds 32,999 variables and is 20MiB when
      # uncompressed. Only the variable count reported by the reader is
      # checked, which avoids high peak memory.
      archive = datapath("io", "data", "stata", "stata1_119.dta.gz")
      with gzip.open(archive, "rb") as gz, StataReader(gz) as reader:
          reader._ensure_open()
          assert reader._nvar == 32999
  @pytest.mark.filterwarnings("ignore:Downcasting behavior:FutureWarning")
  @pytest.mark.parametrize("version", [118, 119, None])
  def test_utf8_writer(self, version):
      """Round-trip non-ASCII names, labels and values through ``StataWriterUTF8``.

      The same frame is afterwards written with ``DataFrame.to_stata`` and
      must read back identically.
      """
      cat = pd.Categorical(["a", "β", "ĉ"], ordered=True)
      data = DataFrame(
          [
              [1.0, 1, "ᴬ", "ᴀ relatively long ŝtring"],
              [2.0, 2, "ᴮ", ""],
              [3.0, 3, "ᴰ", None],
          ],
          columns=["Å", "β", "ĉ", "strls"],
      )
      data["ᴐᴬᵀ"] = cat
      variable_labels = {
          "Å": "apple",
          "β": "ᵈᵉᵊ",
          "ĉ": "ᴎტჄႲႳႴႶႺ",
          "strls": "Long Strings",
          "ᴐᴬᵀ": "",
      }
      data_label = "ᴅaᵀa-label"
      value_labels = {"β": {1: "label", 2: "æøå", 3: "ŋot valid latin-1"}}
      data["β"] = data["β"].astype(np.int32)
      with tm.ensure_clean() as path:
          writer = StataWriterUTF8(
              path,
              data,
              data_label=data_label,
              convert_strl=["strls"],
              variable_labels=variable_labels,
              write_index=False,
              version=version,
              value_labels=value_labels,
          )
          writer.write_file()
          reread_encoded = read_stata(path)
          # From here on ``data`` is rewritten in place into the frame the
          # reader is expected to return; the statement order matters.
          # Missing is intentionally converted to empty strl
          data["strls"] = data["strls"].fillna("")
          # Variable with value labels is reread as categorical
          data["β"] = (
              data["β"].replace(value_labels["β"]).astype("category").cat.as_ordered()
          )
          tm.assert_frame_equal(data, reread_encoded)
          # The dataset label and variable labels must survive as well.
          with StataReader(path) as reader:
              assert reader.data_label == data_label
              assert reader.variable_labels() == variable_labels

          # Second pass: write the already-converted frame through the
          # DataFrame method and expect an identical round trip.
          data.to_stata(path, version=version, write_index=False)
          reread_to_stata = read_stata(path)
          tm.assert_frame_equal(data, reread_to_stata)
  def test_writer_118_exceptions(self):
      """``StataWriterUTF8`` rejects version 117, and 118 for a 33000-column frame."""
      wide = DataFrame(np.zeros((1, 33000), dtype=np.int8))
      rejected = (
          (117, "version must be either 118 or 119."),
          (118, "You must use version 119"),
      )
      for bad_version, pattern in rejected:
          with tm.ensure_clean() as path, pytest.raises(ValueError, match=pattern):
              StataWriterUTF8(path, wide, version=bad_version)
  @pytest.mark.parametrize(
      "dtype_backend",
      ["numpy_nullable", pytest.param("pyarrow", marks=td.skip_if_no("pyarrow"))],
  )
  def test_read_write_ea_dtypes(self, dtype_backend):
      """Write a frame converted with ``convert_dtypes`` and check the round trip.

      The frame is written once as format 118 (checked only for not raising)
      and once with the default version, which is read back and compared with
      a plain NumPy-typed expectation: missing values become NaN, the boolean
      column comes back as floats, and the index as int32.
      """
      df = DataFrame(
          {
              "a": [1, 2, None],
              "b": ["a", "b", "c"],
              "c": [True, False, None],
              "d": [1.5, 2.5, 3.5],
              "e": pd.date_range("2020-12-31", periods=3, freq="D"),
          },
          index=pd.Index([0, 1, 2], name="index"),
      )
      df = df.convert_dtypes(dtype_backend=dtype_backend)

      # This write used to target a bare "test_stata.dta" in the current
      # working directory and never removed it. Route it through a managed
      # temporary path so the test leaves nothing behind.
      with tm.ensure_clean("test_stata.dta") as v118_path:
          df.to_stata(v118_path, version=118)

      with tm.ensure_clean() as path:
          df.to_stata(path)
          written_and_read_again = self.read_dta(path)

      expected = DataFrame(
          {
              "a": [1, 2, np.nan],
              "b": ["a", "b", "c"],
              "c": [1.0, 0, np.nan],
              "d": [1.5, 2.5, 3.5],
              "e": pd.date_range("2020-12-31", periods=3, freq="D"),
          },
          index=pd.Index([0, 1, 2], name="index", dtype=np.int32),
      )

      tm.assert_frame_equal(written_and_read_again.set_index("index"), expected)
  @pytest.mark.parametrize("version", [105, 108, 111, 113, 114])
  def test_backward_compat(version, datapath):
      """An old-format compat file reads the same as the 118 reference, dtypes aside."""
      data_dir = datapath("io", "data", "stata")

      def compat_file(tag):
          return os.path.join(data_dir, f"stata-compat-{tag}.dta")

      reference = read_stata(compat_file(118))
      legacy = read_stata(compat_file(version))
      tm.assert_frame_equal(legacy, reference, check_dtype=False)
  def test_direct_read(datapath, monkeypatch):
      """``StataReader`` uses the given path, file object or BytesIO as-is."""
      file_path = datapath("io", "data", "stata", "stata-compat-118.dta")

      # Opening by path must not buffer the file into memory.
      with StataReader(file_path) as reader:
          assert not reader.read().empty
          assert not isinstance(reader._path_or_buf, io.BytesIO)

      # A caller-supplied file object is used exactly, if possible.
      with open(file_path, "rb") as fp, StataReader(fp) as reader:
          assert not reader.read().empty
          assert reader._path_or_buf is fp

      # A caller-supplied BytesIO is used exactly, if possible.
      with open(file_path, "rb") as fp, io.BytesIO(fp.read()) as bio:
          with StataReader(bio) as reader:
              assert not reader.read().empty
              assert reader._path_or_buf is bio
  def test_statareader_warns_when_used_without_context(datapath):
      """Reading outside a ``with`` block and calling ``close()`` both warn."""
      file_path = datapath("io", "data", "stata", "stata-compat-118.dta")

      expect_resource_warning = tm.assert_produces_warning(
          ResourceWarning,
          match="without using a context manager",
      )
      with expect_resource_warning:
          sr = StataReader(file_path)
          sr.read()

      expect_future_warning = tm.assert_produces_warning(
          FutureWarning,
          match="is not part of the public API",
      )
      with expect_future_warning:
          sr.close()
  @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
  @pytest.mark.parametrize("use_dict", [True, False])
  @pytest.mark.parametrize("infer", [True, False])
  def test_compression(compression, version, use_dict, infer, compression_to_extension):
      """Write with each compression spelling, decompress by hand, read back."""
      file_name = "dta_inferred_compression.dta"
      if compression:
          suffix = compression if use_dict else compression_to_extension[compression]
          file_name = f"{file_name}.{suffix}"

      # The dict form wins over "infer", which wins over the bare method name.
      if use_dict:
          compression_arg = {"method": compression}
      elif infer:
          compression_arg = "infer"
      else:
          compression_arg = compression

      frame = DataFrame(
          np.random.default_rng(2).standard_normal((10, 2)), columns=list("AB")
      )
      frame.index.name = "index"

      with tm.ensure_clean(file_name) as path:
          frame.to_stata(path, version=version, compression=compression_arg)

          # Undo the compression with the matching module so read_stata
          # receives plain .dta bytes (or the path when nothing was compressed).
          if compression is None:
              source = path
          elif compression == "gzip":
              with gzip.open(path, "rb") as packed:
                  source = io.BytesIO(packed.read())
          elif compression == "zip":
              with zipfile.ZipFile(path, "r") as packed:
                  source = io.BytesIO(packed.read(packed.filelist[0]))
          elif compression == "tar":
              with tarfile.open(path) as archive:
                  member = archive.extractfile(archive.getnames()[0])
                  source = io.BytesIO(member.read())
          elif compression == "bz2":
              with bz2.open(path, "rb") as packed:
                  source = io.BytesIO(packed.read())
          elif compression == "zstd":
              zstd = pytest.importorskip("zstandard")
              with zstd.open(path, "rb") as packed:
                  source = io.BytesIO(packed.read())
          elif compression == "xz":
              lzma = pytest.importorskip("lzma")
              with lzma.open(path, "rb") as packed:
                  source = io.BytesIO(packed.read())

          roundtripped = read_stata(source, index_col="index")

      expected = frame.copy()
      expected.index = expected.index.astype(np.int32)
      tm.assert_frame_equal(roundtripped, expected)
  @pytest.mark.parametrize("method", ["zip", "infer"])
  @pytest.mark.parametrize("file_ext", [None, "dta", "zip"])
  def test_compression_dict(method, file_ext):
      """A compression dict with ``archive_name`` names the zip member."""
      file_name = f"test.{file_ext}"
      archive_name = "test.dta"
      frame = DataFrame(
          np.random.default_rng(2).standard_normal((10, 2)), columns=list("AB")
      )
      frame.index.name = "index"

      with tm.ensure_clean(file_name) as path:
          frame.to_stata(
              path, compression={"method": method, "archive_name": archive_name}
          )

          zipped = method == "zip" or file_ext == "zip"
          if not zipped:
              source = path
          else:
              with zipfile.ZipFile(path, "r") as zp:
                  # Exactly one member, stored under the requested name.
                  assert len(zp.filelist) == 1
                  assert zp.filelist[0].filename == archive_name
                  source = io.BytesIO(zp.read(zp.filelist[0]))

          roundtripped = read_stata(source, index_col="index")

      expected = frame.copy()
      expected.index = expected.index.astype(np.int32)
      tm.assert_frame_equal(roundtripped, expected)
  @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
  def test_chunked_categorical(version):
      """Each chunk of a categorical column matches the same slice of the input."""
      chunk_rows = 2
      frame = DataFrame({"cats": Series(["a", "b", "a", "b", "c"], dtype="category")})
      frame.index.name = "index"
      expected = frame.copy()
      expected.index = expected.index.astype(np.int32)

      with tm.ensure_clean() as path:
          frame.to_stata(path, version=version)
          with StataReader(
              path, chunksize=chunk_rows, order_categoricals=False
          ) as reader:
              for i, block in enumerate(reader):
                  block = block.set_index("index")
                  assert "cats" in block
                  start = chunk_rows * i
                  stop = chunk_rows * (i + 1)
                  tm.assert_series_equal(block.cats, expected.cats.iloc[start:stop])
  def test_chunked_categorical_partial(datapath):
      """Chunked reads of a partially labelled column warn and keep per-chunk categories."""
      dta_file = datapath("io", "data", "stata", "stata-dta-partially-labeled.dta")
      values = ["a", "b", "a", "b", 3.0]

      with StataReader(dta_file, chunksize=2) as reader:
          with tm.assert_produces_warning(CategoricalConversionWarning):
              for i, block in enumerate(reader):
                  assert list(block.cats) == values[2 * i : 2 * (i + 1)]
                  # The first two chunks hold labelled values; the last chunk
                  # holds the single unlabelled number.
                  if i >= 2:
                      categories = pd.Index([3.0], dtype="float64")
                  else:
                      categories = pd.Index(["a", "b"])
                  tm.assert_index_equal(block.cats.cat.categories, categories)

      # A single chunk spanning the whole file must equal a direct read.
      with tm.assert_produces_warning(CategoricalConversionWarning):
          with StataReader(dta_file, chunksize=5) as reader:
              large_chunk = next(reader)
      direct = read_stata(dta_file)
      tm.assert_frame_equal(direct, large_chunk)
  @pytest.mark.parametrize("chunksize", (-1, 0, "apple"))
  def test_iterator_errors(datapath, chunksize):
      """A ``chunksize`` of -1, 0 or a string raises ValueError."""
      dta_file = datapath("io", "data", "stata", "stata-dta-partially-labeled.dta")
      expect_error = pytest.raises(ValueError, match="chunksize must be a positive")
      with expect_error, StataReader(dta_file, chunksize=chunksize):
          pass
  def test_iterator_value_labels():
      """Every chunk carries the full category set, not just the values it contains."""
      # GH 31544
      chunk_rows = 100
      labels = ["c_label", "b_label"] + ["a_label"] * 500
      frame = DataFrame(
          {f"col{k}": pd.Categorical(labels, ordered=True) for k in range(2)}
      )

      with tm.ensure_clean() as path:
          frame.to_stata(path, write_index=False)
          expected = pd.Index(["a_label", "b_label", "c_label"])
          with read_stata(path, chunksize=chunk_rows) as reader:
              for j, chunk in enumerate(reader):
                  for position in range(2):
                      categories = chunk.dtypes.iloc[position].categories
                      tm.assert_index_equal(categories, expected)
                  rows = slice(j * chunk_rows, (j + 1) * chunk_rows)
                  tm.assert_frame_equal(chunk, frame.iloc[rows])
  def test_precision_loss():
      """Writing large int64 values warns and they read back as float64."""
      # All-ones bit patterns: 60 bits for "big", 52 bits for "little".
      big = 2**60 - 1
      little = 2**52 - 1
      frame = DataFrame([[big, little]], columns=["big", "little"])

      with tm.ensure_clean() as path:
          expect_warning = tm.assert_produces_warning(
              PossiblePrecisionLoss, match="Column converted from int64 to float64"
          )
          with expect_warning:
              frame.to_stata(path, write_index=False)
          roundtripped = read_stata(path)

          expected_dtypes = Series([np.float64, np.float64], index=["big", "little"])
          tm.assert_series_equal(roundtripped.dtypes, expected_dtypes)
          assert roundtripped.loc[0, "little"] == frame.loc[0, "little"]
          assert roundtripped.loc[0, "big"] == float(frame.loc[0, "big"])
  def test_compression_roundtrip(compression):
      """Round-trip with explicit compression, then again after manual decompression."""
      frame = DataFrame(
          [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]],
          index=["A", "B"],
          columns=["X", "Y", "Z"],
      )
      frame.index.name = "index"

      with tm.ensure_clean() as path:
          frame.to_stata(path, compression=compression)
          via_reader = read_stata(path, compression=compression, index_col="index")
          tm.assert_frame_equal(frame, via_reader)

          # explicitly ensure file was compressed.
          with tm.decompress_file(path, compression) as fh:
              raw = io.BytesIO(fh.read())
          via_bytes = read_stata(raw, index_col="index")
          tm.assert_frame_equal(frame, via_bytes)
  @pytest.mark.parametrize("to_infer", [True, False])
  @pytest.mark.parametrize("read_infer", [True, False])
  def test_stata_compression(
      compression_only, read_infer, to_infer, compression_to_extension
  ):
      """Explicit and inferred compression interoperate for writing and reading."""
      compression = compression_only
      filename = f"test.{compression_to_extension[compression]}"

      frame = DataFrame(
          [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]],
          index=["A", "B"],
          columns=["X", "Y", "Z"],
      )
      frame.index.name = "index"

      write_mode = "infer" if to_infer else compression
      read_mode = "infer" if read_infer else compression

      with tm.ensure_clean(filename) as path:
          frame.to_stata(path, compression=write_mode)
          roundtripped = read_stata(path, compression=read_mode, index_col="index")
          tm.assert_frame_equal(roundtripped, frame)
  def test_non_categorical_value_labels():
      """User-supplied value labels are written; invalid targets are rejected."""
      frame = DataFrame(
          {
              "fully_labelled": [1, 2, 3, 3, 1],
              "partially_labelled": [1.0, 2.0, np.nan, 9.0, np.nan],
              "Y": [7, 7, 9, 8, 10],
              "Z": pd.Categorical(["j", "k", "l", "k", "j"]),
          }
      )

      with tm.ensure_clean() as path:
          supplied = {
              "fully_labelled": {1: "one", 2: "two", 3: "three"},
              "partially_labelled": {1.0: "one", 2.0: "two"},
          }
          # The categorical column contributes its own labels when read back.
          expected = {**supplied, "Z": {0: "j", 1: "k", 2: "l"}}

          StataWriter(path, frame, value_labels=supplied).write_file()

          with StataReader(path) as reader:
              found = reader.value_labels()
              assert found == expected

          # Labels for a column that is not in the frame.
          pattern = "Can't create value labels for notY, it wasn't found in the dataset."
          with pytest.raises(KeyError, match=pattern):
              unknown_column = {"notY": {7: "label1", 8: "label2"}}
              StataWriter(path, frame, value_labels=unknown_column)

          # Labels for a non-numeric column.
          pattern = (
              "Can't create value labels for Z, value labels "
              "can only be applied to numeric columns."
          )
          with pytest.raises(ValueError, match=pattern):
              non_numeric = {"Z": {1: "a", 2: "k", 3: "j", 4: "i"}}
              StataWriter(path, frame, value_labels=non_numeric)
  def test_non_categorical_value_label_name_conversion():
      """Value labels are stored under the converted names of invalid columns."""
      # Each column name below is invalid for a different reason.
      frame = DataFrame(
          {
              # Only alphanumeric and _
              "invalid~!": [1, 1, 2, 3, 5, 8],
              # Must start with letter or _
              "6_invalid": [1, 1, 2, 3, 5, 8],
              # Too long
              "invalid_name_longer_than_32_characters": [8, 8, 9, 9, 8, 8],
              # Reserved words
              "aggregate": [2, 5, 5, 6, 6, 9],
              # Hashable non-string
              (1, 2): [1, 2, 3, 4, 5, 6],
          }
      )

      supplied = {
          "invalid~!": {1: "label1", 2: "label2"},
          "6_invalid": {1: "label1", 2: "label2"},
          "invalid_name_longer_than_32_characters": {8: "eight", 9: "nine"},
          "aggregate": {5: "five"},
          (1, 2): {3: "three"},
      }

      # Same labels, keyed by the names expected after conversion.
      expected = {
          "invalid__": {1: "label1", 2: "label2"},
          "_6_invalid": {1: "label1", 2: "label2"},
          "invalid_name_longer_than_32_char": {8: "eight", 9: "nine"},
          "_aggregate": {5: "five"},
          "_1__2_": {3: "three"},
      }

      with tm.ensure_clean() as path:
          with tm.assert_produces_warning(InvalidColumnName):
              frame.to_stata(path, value_labels=supplied)

          with StataReader(path) as reader:
              found = reader.value_labels()
              assert found == expected
  def test_non_categorical_value_label_convert_categoricals_error():
      """Repeated labels can be written and read raw, but not converted to categoricals."""
      # Mapping more than one value to the same label is valid for Stata
      # labels, but can't be read with convert_categoricals=True
      value_labels = {
          "repeated_labels": {10: "Ten", 20: "More than ten", 40: "More than ten"}
      }

      data = DataFrame(
          {
              "repeated_labels": [10, 10, 20, 20, 40, 40],
          }
      )

      with tm.ensure_clean() as path:
          data.to_stata(path, value_labels=value_labels)

          # With conversion disabled the labels come back exactly as supplied.
          with StataReader(path, convert_categoricals=False) as reader:
              reader_value_labels = reader.value_labels()
          assert reader_value_labels == value_labels

          col = "repeated_labels"
          # An 80-dash rule followed by the duplicated label.
          repeats = "-" * 80 + "\n" + "\n".join(["More than ten"])

          # NOTE(review): ``msg`` is used as a regex by ``pytest.raises``. This
          # block was recovered from a copy whose blank lines were stripped, so
          # blank lines inside the literal may be missing -- confirm against the
          # message the reader actually raises.
          msg = f"""
Value labels for column {col} are not unique. These cannot be converted to
pandas categoricals.
Either read the file with `convert_categoricals` set to False or use the
low level interface in `StataReader` to separately read the values and the
value_labels.
The repeated labels are:
{repeats}
"""
          with pytest.raises(ValueError, match=msg):
              read_stata(path, convert_categoricals=True)
  @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
  @pytest.mark.parametrize(
      "dtype",
      [
          pd.BooleanDtype,
          pd.Int8Dtype,
          pd.Int16Dtype,
          pd.Int32Dtype,
          pd.Int64Dtype,
          pd.UInt8Dtype,
          pd.UInt16Dtype,
          pd.UInt32Dtype,
          pd.UInt64Dtype,
      ],
  )
  def test_nullable_support(dtype, version):
      """``pd.NA`` in nullable columns reads back as a ``StataMissingValue``."""
      frame = DataFrame(
          {
              "a": Series([1.0, 2.0, 3.0]),
              "b": Series([1, pd.NA, pd.NA], dtype=dtype.name),
              "c": Series(["a", "b", None]),
          }
      )

      # Only use supported names: no uint, bool or int64
      numpy_name = frame.b.dtype.numpy_dtype.name.replace("u", "")
      numpy_name = {"int64": "int32", "bool": "int8"}.get(numpy_name, numpy_name)

      missing = StataMissingValue(StataMissingValue.BASE_MISSING_VALUES[numpy_name])
      expected_b = Series([1, missing, missing], dtype=object, name="b")
      expected_c = Series(["a", "b", ""], name="c")

      with tm.ensure_clean() as path:
          frame.to_stata(path, write_index=False, version=version)
          roundtripped = read_stata(path, convert_missing=True)
          tm.assert_series_equal(frame.a, roundtripped.a)
          tm.assert_series_equal(roundtripped.b, expected_b)
          tm.assert_series_equal(roundtripped.c, expected_c)
  def test_empty_frame():
      """A zero-row frame round-trips, both in full and for a column subset."""
      # GH 46240
      # Build the zero-row frame from integer and float columns.
      empty = DataFrame(data={"a": range(3), "b": [1.0, 2.0, 3.0]}).head(0)
      # Column "a" is expected back as int32, not the dtype it was built with.
      expected_dtypes = Series({"a": np.dtype("int32"), "b": np.dtype("float64")})

      with tm.ensure_clean() as path:
          empty.to_stata(path, write_index=False, version=117)

          # Whole file.
          whole = read_stata(path)
          assert "b" in whole
          tm.assert_series_equal(whole.dtypes, expected_dtypes)

          # A single column of the empty .dta file.
          only_a = read_stata(path, columns=["a"])
          assert "b" not in only_a
          tm.assert_series_equal(only_a.dtypes, expected_dtypes.loc[["a"]])