sql.py 102 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766176717681769177017711772177317741775177617771778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432243324342435243624372438243924402441244224432444244524462447244824492450245124522453245424552456245724582459246024612462246324642465246624672468246924702471247224732474247524762477247824792480248124822483248424852486248724882489249024912492249324942495249624972498249925002501250225032504250525062507250825092510251125122513251425152516251725182519252025212522252325242525252625272528252925302531253225332534253525362537253825392540254125422543254425452546254725482549255025512552255325542555255625572558255925602561256225632564256525662567256825692570257125722573257425752576257725782579258025812582258325842585258625872588258925902591259225932594259525962597259825992600260126022603260426052606260726082609261026112612261326142615261626172618261926202621262226232624262526262627262826292630263126322633263426352636263726382639264026412642264326442645264626472648264926502651265226532654265526562657265826592660266126622663266426652666266726682669267026712672267326742675267626772678267926802681268226832684268526862687268826892690269126922693269426952696269726982699270027012702270327042705270627072708270927102711271227132714271527162717271827192720272127222723272427252726272727282729273027312732273327342735273627372738273927402741274227432744274527462747274827492750275127522753275427552756275727582759276027612762276327642765276627672768276927702771277227732774277527762777277827792780278127822783278427852786278727882789279027912792279327942795279627972798279928002801280228032804280528062807280828092810281128122813281428152816281728182819282028212822282328242825282628272828282928302831283228332834283528362837283828392840284128422843284428452846284728482849285028512852285328542855285628572858285928602861286228632864286528662867286828692870287128722873287428752876287728782879288028812882288328842885288628872888288928902891289228932894289528962897289828992900290129022903290429052906290729082909291029112912291329142915291629172918291929202921292229232924292529262927292829292930293129322933293429352936293729382939294029412942294329442945294629472948294929502951295229532954295529562957295829592960
  1. """
  2. Collection of query wrappers / abstractions to both facilitate data
  3. retrieval and to reduce dependency on DB-specific API.
  4. """
  5. from __future__ import annotations
  6. from abc import (
  7. ABC,
  8. abstractmethod,
  9. )
  10. from contextlib import (
  11. ExitStack,
  12. contextmanager,
  13. )
  14. from datetime import (
  15. date,
  16. datetime,
  17. time,
  18. )
  19. from functools import partial
  20. import re
  21. from typing import (
  22. TYPE_CHECKING,
  23. Any,
  24. Literal,
  25. Self,
  26. cast,
  27. overload,
  28. )
  29. import warnings
  30. import numpy as np
  31. from pandas._config import using_string_dtype
  32. from pandas._libs import lib
  33. from pandas.compat._optional import (
  34. VERSIONS,
  35. import_optional_dependency,
  36. )
  37. from pandas.errors import (
  38. AbstractMethodError,
  39. DatabaseError,
  40. )
  41. from pandas.util._decorators import set_module
  42. from pandas.util._exceptions import find_stack_level
  43. from pandas.util._validators import check_dtype_backend
  44. from pandas.core.dtypes.common import (
  45. is_dict_like,
  46. is_list_like,
  47. is_object_dtype,
  48. is_string_dtype,
  49. )
  50. from pandas.core.dtypes.dtypes import DatetimeTZDtype
  51. from pandas.core.dtypes.missing import isna
  52. from pandas import get_option
  53. from pandas.core.api import (
  54. DataFrame,
  55. Series,
  56. )
  57. from pandas.core.arrays import ArrowExtensionArray
  58. from pandas.core.arrays.string_ import StringDtype
  59. from pandas.core.base import PandasObject
  60. import pandas.core.common as com
  61. from pandas.core.common import maybe_make_list
  62. from pandas.core.internals.construction import convert_object_array
  63. from pandas.core.tools.datetimes import to_datetime
  64. from pandas.io._util import arrow_table_to_pandas
  65. if TYPE_CHECKING:
  66. from collections.abc import (
  67. Callable,
  68. Generator,
  69. Iterator,
  70. Mapping,
  71. )
  72. from sqlalchemy import Table
  73. from sqlalchemy.sql.expression import (
  74. Delete,
  75. Select,
  76. TextClause,
  77. )
  78. from pandas._typing import (
  79. DtypeArg,
  80. DtypeBackend,
  81. IndexLabel,
  82. )
  83. from pandas import Index
  84. # -----------------------------------------------------------------------------
  85. # -- Helper functions
  86. def _process_parse_dates_argument(parse_dates):
  87. """Process parse_dates argument for read_sql functions"""
  88. # handle non-list entries for parse_dates gracefully
  89. if parse_dates is True or parse_dates is None or parse_dates is False:
  90. parse_dates = []
  91. elif not hasattr(parse_dates, "__iter__"):
  92. parse_dates = [parse_dates]
  93. return parse_dates
  94. def _handle_date_column(
  95. col, utc: bool = False, format: str | dict[str, Any] | None = None
  96. ):
  97. if isinstance(format, dict):
  98. # GH35185 Allow custom error values in parse_dates argument of
  99. # read_sql like functions.
  100. # Format can take on custom to_datetime argument values such as
  101. # {"errors": "coerce"} or {"dayfirst": True}
  102. return to_datetime(col, **format)
  103. else:
  104. # Allow passing of formatting string for integers
  105. # GH17855
  106. if format is None and (
  107. issubclass(col.dtype.type, np.floating)
  108. or issubclass(col.dtype.type, np.integer)
  109. ):
  110. format = "s"
  111. if format in ["D", "d", "h", "m", "s", "ms", "us", "ns"]:
  112. return to_datetime(col, errors="coerce", unit=format, utc=utc)
  113. elif isinstance(col.dtype, DatetimeTZDtype):
  114. # coerce to UTC timezone
  115. # GH11216
  116. return to_datetime(col, utc=True)
  117. else:
  118. return to_datetime(col, errors="coerce", format=format, utc=utc)
  119. def _parse_date_columns(data_frame: DataFrame, parse_dates) -> DataFrame:
  120. """
  121. Force non-datetime columns to be read as such.
  122. Supports both string formatted and integer timestamp columns.
  123. """
  124. parse_dates = _process_parse_dates_argument(parse_dates)
  125. # we want to coerce datetime64_tz dtypes for now to UTC
  126. # we could in theory do a 'nice' conversion from a FixedOffset tz
  127. # GH11216
  128. for i, (col_name, df_col) in enumerate(data_frame.items()):
  129. if isinstance(df_col.dtype, DatetimeTZDtype) or col_name in parse_dates:
  130. try:
  131. fmt = parse_dates[col_name]
  132. except (KeyError, TypeError):
  133. fmt = None
  134. data_frame.isetitem(i, _handle_date_column(df_col, format=fmt))
  135. return data_frame
  136. def _convert_arrays_to_dataframe(
  137. data,
  138. columns,
  139. coerce_float: bool = True,
  140. dtype_backend: DtypeBackend | Literal["numpy"] = "numpy",
  141. ) -> DataFrame:
  142. content = lib.to_object_array_tuples(data)
  143. idx_len = content.shape[0]
  144. arrays = convert_object_array(
  145. list(content.T),
  146. dtype=None,
  147. coerce_float=coerce_float,
  148. dtype_backend=dtype_backend,
  149. )
  150. if dtype_backend == "pyarrow":
  151. pa = import_optional_dependency("pyarrow")
  152. result_arrays = []
  153. for arr in arrays:
  154. pa_array = pa.array(arr, from_pandas=True)
  155. if arr.dtype == "string":
  156. # TODO: Arrow still infers strings arrays as regular strings instead
  157. # of large_string, which is what we preserver everywhere else for
  158. # dtype_backend="pyarrow". We may want to reconsider this
  159. pa_array = pa_array.cast(pa.string())
  160. result_arrays.append(ArrowExtensionArray(pa_array))
  161. arrays = result_arrays # type: ignore[assignment]
  162. if arrays:
  163. return DataFrame._from_arrays(
  164. arrays, columns=columns, index=range(idx_len), verify_integrity=False
  165. )
  166. else:
  167. return DataFrame(columns=columns)
  168. def _wrap_result(
  169. data,
  170. columns,
  171. index_col=None,
  172. coerce_float: bool = True,
  173. parse_dates=None,
  174. dtype: DtypeArg | None = None,
  175. dtype_backend: DtypeBackend | Literal["numpy"] = "numpy",
  176. ) -> DataFrame:
  177. """Wrap result set of a SQLAlchemy query in a DataFrame."""
  178. frame = _convert_arrays_to_dataframe(data, columns, coerce_float, dtype_backend)
  179. if dtype:
  180. frame = frame.astype(dtype)
  181. frame = _parse_date_columns(frame, parse_dates)
  182. if index_col is not None:
  183. frame = frame.set_index(index_col)
  184. return frame
  185. def _wrap_result_adbc(
  186. df: DataFrame,
  187. *,
  188. index_col=None,
  189. parse_dates=None,
  190. dtype: DtypeArg | None = None,
  191. dtype_backend: DtypeBackend | Literal["numpy"] = "numpy",
  192. ) -> DataFrame:
  193. """Wrap result set of a SQLAlchemy query in a DataFrame."""
  194. if dtype:
  195. df = df.astype(dtype)
  196. df = _parse_date_columns(df, parse_dates)
  197. if index_col is not None:
  198. df = df.set_index(index_col)
  199. return df
  200. # -----------------------------------------------------------------------------
  201. # -- Read and write to DataFrames
  202. @overload
  203. def read_sql_table( # pyright: ignore[reportOverlappingOverload]
  204. table_name: str,
  205. con,
  206. schema=...,
  207. index_col: str | list[str] | None = ...,
  208. coerce_float=...,
  209. parse_dates: list[str] | dict[str, str] | dict[str, dict[str, Any]] | None = ...,
  210. columns: list[str] | None = ...,
  211. chunksize: None = ...,
  212. dtype_backend: DtypeBackend | lib.NoDefault = ...,
  213. ) -> DataFrame: ...
  214. @overload
  215. def read_sql_table(
  216. table_name: str,
  217. con,
  218. schema=...,
  219. index_col: str | list[str] | None = ...,
  220. coerce_float=...,
  221. parse_dates: list[str] | dict[str, str] | dict[str, dict[str, Any]] | None = ...,
  222. columns: list[str] | None = ...,
  223. chunksize: int = ...,
  224. dtype_backend: DtypeBackend | lib.NoDefault = ...,
  225. ) -> Iterator[DataFrame]: ...
  226. @set_module("pandas")
  227. def read_sql_table(
  228. table_name: str,
  229. con,
  230. schema: str | None = None,
  231. index_col: str | list[str] | None = None,
  232. coerce_float: bool = True,
  233. parse_dates: list[str] | dict[str, str] | dict[str, dict[str, Any]] | None = None,
  234. columns: list[str] | None = None,
  235. chunksize: int | None = None,
  236. dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
  237. ) -> DataFrame | Iterator[DataFrame]:
  238. """
  239. Read SQL database table into a DataFrame.
  240. Given a table name and a SQLAlchemy connectable, returns a DataFrame.
  241. This function does not support DBAPI connections.
  242. Parameters
  243. ----------
  244. table_name : str
  245. Name of SQL table in database.
  246. con : SQLAlchemy connectable or str
  247. A database URI could be provided as str.
  248. SQLite DBAPI connection mode not supported.
  249. schema : str, default None
  250. Name of SQL schema in database to query (if database flavor
  251. supports this). Uses default schema if None (default).
  252. index_col : str or list of str, optional, default: None
  253. Column(s) to set as index(MultiIndex).
  254. coerce_float : bool, default True
  255. Attempts to convert values of non-string, non-numeric objects (like
  256. decimal.Decimal) to floating point. Can result in loss of Precision.
  257. parse_dates : list or dict, default None
  258. - List of column names to parse as dates.
  259. - Dict of ``{column_name: format string}`` where format string is
  260. strftime compatible in case of parsing string times or is one of
  261. (D, s, ns, ms, us) in case of parsing integer timestamps.
  262. - Dict of ``{column_name: arg dict}``, where the arg dict corresponds
  263. to the keyword arguments of :func:`pandas.to_datetime`
  264. Especially useful with databases without native Datetime support,
  265. such as SQLite.
  266. columns : list, default None
  267. List of column names to select from SQL table.
  268. chunksize : int, default None
  269. If specified, returns an iterator where `chunksize` is the number of
  270. rows to include in each chunk.
  271. dtype_backend : {'numpy_nullable', 'pyarrow'}
  272. Back-end data type applied to the resultant :class:`DataFrame`
  273. (still experimental). If not specified, the default behavior
  274. is to not use nullable data types. If specified, the behavior
  275. is as follows:
  276. * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
  277. * ``"pyarrow"``: returns pyarrow-backed nullable
  278. :class:`ArrowDtype` :class:`DataFrame`
  279. .. versionadded:: 2.0
  280. Returns
  281. -------
  282. DataFrame or Iterator[DataFrame]
  283. A SQL table is returned as two-dimensional data structure with labeled
  284. axes.
  285. See Also
  286. --------
  287. read_sql_query : Read SQL query into a DataFrame.
  288. read_sql : Read SQL query or database table into a DataFrame.
  289. Notes
  290. -----
  291. Any datetime values with time zone information will be converted to UTC.
  292. Examples
  293. --------
  294. >>> pd.read_sql_table("table_name", "postgres:///db_name") # doctest:+SKIP
  295. """
  296. check_dtype_backend(dtype_backend)
  297. if dtype_backend is lib.no_default:
  298. dtype_backend = "numpy" # type: ignore[assignment]
  299. assert dtype_backend is not lib.no_default
  300. with pandasSQL_builder(con, schema=schema, need_transaction=True) as pandas_sql:
  301. if not pandas_sql.has_table(table_name):
  302. raise ValueError(f"Table {table_name} not found")
  303. table = pandas_sql.read_table(
  304. table_name,
  305. index_col=index_col,
  306. coerce_float=coerce_float,
  307. parse_dates=parse_dates,
  308. columns=columns,
  309. chunksize=chunksize,
  310. dtype_backend=dtype_backend,
  311. )
  312. if table is not None:
  313. return table
  314. else:
  315. raise ValueError(f"Table {table_name} not found", con)
  316. @overload
  317. def read_sql_query( # pyright: ignore[reportOverlappingOverload]
  318. sql,
  319. con,
  320. index_col: str | list[str] | None = ...,
  321. coerce_float=...,
  322. params: list[Any] | Mapping[str, Any] | None = ...,
  323. parse_dates: list[str] | dict[str, str] | dict[str, dict[str, Any]] | None = ...,
  324. chunksize: None = ...,
  325. dtype: DtypeArg | None = ...,
  326. dtype_backend: DtypeBackend | lib.NoDefault = ...,
  327. ) -> DataFrame: ...
  328. @overload
  329. def read_sql_query(
  330. sql,
  331. con,
  332. index_col: str | list[str] | None = ...,
  333. coerce_float=...,
  334. params: list[Any] | Mapping[str, Any] | None = ...,
  335. parse_dates: list[str] | dict[str, str] | dict[str, dict[str, Any]] | None = ...,
  336. chunksize: int = ...,
  337. dtype: DtypeArg | None = ...,
  338. dtype_backend: DtypeBackend | lib.NoDefault = ...,
  339. ) -> Iterator[DataFrame]: ...
  340. @set_module("pandas")
  341. def read_sql_query(
  342. sql,
  343. con,
  344. index_col: str | list[str] | None = None,
  345. coerce_float: bool = True,
  346. params: list[Any] | Mapping[str, Any] | None = None,
  347. parse_dates: list[str] | dict[str, str] | dict[str, dict[str, Any]] | None = None,
  348. chunksize: int | None = None,
  349. dtype: DtypeArg | None = None,
  350. dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
  351. ) -> DataFrame | Iterator[DataFrame]:
  352. """
  353. Read SQL query into a DataFrame.
  354. Returns a DataFrame corresponding to the result set of the query
  355. string. Optionally provide an `index_col` parameter to use one of the
  356. columns as the index, otherwise default integer index will be used.
  357. Parameters
  358. ----------
  359. sql : str SQL query or SQLAlchemy Selectable (select or text object)
  360. SQL query to be executed.
  361. con : SQLAlchemy connectable, str, or sqlite3 connection
  362. Using SQLAlchemy makes it possible to use any DB supported by that
  363. library. If a DBAPI2 object, only sqlite3 is supported.
  364. index_col : str or list of str, optional, default: None
  365. Column(s) to set as index(MultiIndex).
  366. coerce_float : bool, default True
  367. Attempts to convert values of non-string, non-numeric objects (like
  368. decimal.Decimal) to floating point. Useful for SQL result sets.
  369. params : list, tuple or mapping, optional, default: None
  370. List of parameters to pass to execute method. The syntax used
  371. to pass parameters is database driver dependent. Check your
  372. database driver documentation for which of the five syntax styles,
  373. described in PEP 249's paramstyle, is supported.
  374. Eg. for psycopg2, uses %(name)s so use params={'name' : 'value'}.
  375. parse_dates : list or dict, default: None
  376. - List of column names to parse as dates.
  377. - Dict of ``{column_name: format string}`` where format string is
  378. strftime compatible in case of parsing string times, or is one of
  379. (D, s, ns, ms, us) in case of parsing integer timestamps.
  380. - Dict of ``{column_name: arg dict}``, where the arg dict corresponds
  381. to the keyword arguments of :func:`pandas.to_datetime`
  382. Especially useful with databases without native Datetime support,
  383. such as SQLite.
  384. chunksize : int, default None
  385. If specified, return an iterator where `chunksize` is the number of
  386. rows to include in each chunk.
  387. dtype : Type name or dict of columns
  388. Data type for data or columns. E.g. np.float64 or
  389. {'a': np.float64, 'b': np.int32, 'c': 'Int64'}.
  390. dtype_backend : {'numpy_nullable', 'pyarrow'}
  391. Back-end data type applied to the resultant :class:`DataFrame`
  392. (still experimental). If not specified, the default behavior
  393. is to not use nullable data types. If specified, the behavior
  394. is as follows:
  395. * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
  396. * ``"pyarrow"``: returns pyarrow-backed nullable
  397. :class:`ArrowDtype` :class:`DataFrame`
  398. .. versionadded:: 2.0
  399. Returns
  400. -------
  401. DataFrame or Iterator[DataFrame]
  402. Returns a DataFrame object that contains the result set of the
  403. executed SQL query, in relation to the specified database connection.
  404. See Also
  405. --------
  406. read_sql_table : Read SQL database table into a DataFrame.
  407. read_sql : Read SQL query or database table into a DataFrame.
  408. Notes
  409. -----
  410. Any datetime values with time zone information parsed via the `parse_dates`
  411. parameter will be converted to UTC.
  412. Examples
  413. --------
  414. >>> from sqlalchemy import create_engine # doctest: +SKIP
  415. >>> engine = create_engine("sqlite:///database.db") # doctest: +SKIP
  416. >>> sql_query = "SELECT int_column FROM test_data" # doctest: +SKIP
  417. >>> with engine.connect() as conn, conn.begin(): # doctest: +SKIP
  418. ... data = pd.read_sql_query(sql_query, conn) # doctest: +SKIP
  419. """
  420. check_dtype_backend(dtype_backend)
  421. if dtype_backend is lib.no_default:
  422. dtype_backend = "numpy" # type: ignore[assignment]
  423. assert dtype_backend is not lib.no_default
  424. with pandasSQL_builder(con) as pandas_sql:
  425. return pandas_sql.read_query(
  426. sql,
  427. index_col=index_col,
  428. params=params,
  429. coerce_float=coerce_float,
  430. parse_dates=parse_dates,
  431. chunksize=chunksize,
  432. dtype=dtype,
  433. dtype_backend=dtype_backend,
  434. )
  435. @overload
  436. def read_sql( # pyright: ignore[reportOverlappingOverload]
  437. sql,
  438. con,
  439. index_col: str | list[str] | None = ...,
  440. coerce_float=...,
  441. params=...,
  442. parse_dates=...,
  443. columns: list[str] = ...,
  444. chunksize: None = ...,
  445. dtype_backend: DtypeBackend | lib.NoDefault = ...,
  446. dtype: DtypeArg | None = None,
  447. ) -> DataFrame: ...
  448. @overload
  449. def read_sql(
  450. sql,
  451. con,
  452. index_col: str | list[str] | None = ...,
  453. coerce_float=...,
  454. params=...,
  455. parse_dates=...,
  456. columns: list[str] = ...,
  457. chunksize: int = ...,
  458. dtype_backend: DtypeBackend | lib.NoDefault = ...,
  459. dtype: DtypeArg | None = None,
  460. ) -> Iterator[DataFrame]: ...
  461. @set_module("pandas")
  462. def read_sql(
  463. sql,
  464. con,
  465. index_col: str | list[str] | None = None,
  466. coerce_float: bool = True,
  467. params=None,
  468. parse_dates=None,
  469. columns: list[str] | None = None,
  470. chunksize: int | None = None,
  471. dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
  472. dtype: DtypeArg | None = None,
  473. ) -> DataFrame | Iterator[DataFrame]:
  474. """
  475. Read SQL query or database table into a DataFrame.
  476. This function is a convenience wrapper around ``read_sql_table`` and
  477. ``read_sql_query`` (for backward compatibility). It will delegate
  478. to the specific function depending on the provided input. A SQL query
  479. will be routed to ``read_sql_query``, while a database table name will
  480. be routed to ``read_sql_table``. Note that the delegated function might
  481. have more specific notes about their functionality not listed here.
  482. Parameters
  483. ----------
  484. sql : str or SQLAlchemy Selectable (select or text object)
  485. SQL query to be executed or a table name.
  486. con : ADBC Connection, SQLAlchemy connectable, str, or sqlite3 connection
  487. ADBC provides high performance I/O with native type support, where available.
  488. Using SQLAlchemy makes it possible to use any DB supported by that
  489. library. If a DBAPI2 object, only sqlite3 is supported. The user is responsible
  490. for engine disposal and connection closure for the ADBC connection and
  491. SQLAlchemy connectable; str connections are closed automatically. See
  492. `here <https://docs.sqlalchemy.org/en/20/core/connections.html>`_.
  493. index_col : str or list of str, optional, default: None
  494. Column(s) to set as index(MultiIndex).
  495. coerce_float : bool, default True
  496. Attempts to convert values of non-string, non-numeric objects (like
  497. decimal.Decimal) to floating point, useful for SQL result sets.
  498. params : list, tuple or dict, optional, default: None
  499. List of parameters to pass to execute method. The syntax used
  500. to pass parameters is database driver dependent. Check your
  501. database driver documentation for which of the five syntax styles,
  502. described in PEP 249's paramstyle, is supported.
  503. Eg. for psycopg2, uses %(name)s so use params={'name' : 'value'}.
  504. parse_dates : list or dict, default: None
  505. - List of column names to parse as dates.
  506. - Dict of ``{column_name: format string}`` where format string is
  507. strftime compatible in case of parsing string times, or is one of
  508. (D, s, ns, ms, us) in case of parsing integer timestamps.
  509. - Dict of ``{column_name: arg dict}``, where the arg dict corresponds
  510. to the keyword arguments of :func:`pandas.to_datetime`
  511. Especially useful with databases without native Datetime support,
  512. such as SQLite.
  513. columns : list, default: None
  514. List of column names to select from SQL table (only used when reading
  515. a table).
  516. chunksize : int, default None
  517. If specified, return an iterator where `chunksize` is the
  518. number of rows to include in each chunk.
  519. dtype_backend : {'numpy_nullable', 'pyarrow'}
  520. Back-end data type applied to the resultant :class:`DataFrame`
  521. (still experimental). If not specified, the default behavior
  522. is to not use nullable data types. If specified, the behavior
  523. is as follows:
  524. * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
  525. * ``"pyarrow"``: returns pyarrow-backed nullable
  526. :class:`ArrowDtype` :class:`DataFrame`
  527. .. versionadded:: 2.0
  528. dtype : Type name or dict of columns
  529. Data type for data or columns. E.g. np.float64 or
  530. {'a': np.float64, 'b': np.int32, 'c': 'Int64'}.
  531. The argument is ignored if a table is passed instead of a query.
  532. .. versionadded:: 2.0.0
  533. Returns
  534. -------
  535. DataFrame or Iterator[DataFrame]
  536. Returns a DataFrame object that contains the result set of the
  537. executed SQL query or an SQL Table based on the provided input,
  538. in relation to the specified database connection.
  539. See Also
  540. --------
  541. read_sql_table : Read SQL database table into a DataFrame.
  542. read_sql_query : Read SQL query into a DataFrame.
  543. Notes
  544. -----
  545. ``pandas`` does not attempt to sanitize SQL statements;
  546. instead it simply forwards the statement you are executing
  547. to the underlying driver, which may or may not sanitize from there.
  548. Please refer to the underlying driver documentation for any details.
  549. Generally, be wary when accepting statements from arbitrary sources.
  550. Examples
  551. --------
  552. Read data from SQL via either a SQL query or a SQL tablename.
  553. When using a SQLite database only SQL queries are accepted,
  554. providing only the SQL tablename will result in an error.
  555. >>> from sqlite3 import connect
  556. >>> conn = connect(":memory:")
  557. >>> df = pd.DataFrame(
  558. ... data=[[0, "10/11/12"], [1, "12/11/10"]],
  559. ... columns=["int_column", "date_column"],
  560. ... )
  561. >>> df.to_sql(name="test_data", con=conn)
  562. 2
  563. >>> pd.read_sql("SELECT int_column, date_column FROM test_data", conn)
  564. int_column date_column
  565. 0 0 10/11/12
  566. 1 1 12/11/10
  567. >>> pd.read_sql("test_data", "postgres:///db_name") # doctest:+SKIP
  568. For parameterized query, using ``params`` is recommended over string interpolation.
  569. >>> from sqlalchemy import text
  570. >>> sql = text(
  571. ... "SELECT int_column, date_column FROM test_data WHERE int_column=:int_val"
  572. ... )
  573. >>> pd.read_sql(sql, conn, params={"int_val": 1}) # doctest:+SKIP
  574. int_column date_column
  575. 0 1 12/11/10
  576. Apply date parsing to columns through the ``parse_dates`` argument
  577. The ``parse_dates`` argument calls ``pd.to_datetime`` on the provided columns.
  578. Custom argument values for applying ``pd.to_datetime`` on a column are specified
  579. via a dictionary format:
  580. >>> pd.read_sql(
  581. ... "SELECT int_column, date_column FROM test_data",
  582. ... conn,
  583. ... parse_dates={"date_column": {"format": "%d/%m/%y"}},
  584. ... )
  585. int_column date_column
  586. 0 0 2012-11-10
  587. 1 1 2010-11-12
  588. .. versionadded:: 2.2.0
  589. pandas now supports reading via ADBC drivers
  590. >>> from adbc_driver_postgresql import dbapi # doctest:+SKIP
  591. >>> with dbapi.connect("postgres:///db_name") as conn: # doctest:+SKIP
  592. ... pd.read_sql("SELECT int_column FROM test_data", conn)
  593. int_column
  594. 0 0
  595. 1 1
  596. """
  597. check_dtype_backend(dtype_backend)
  598. if dtype_backend is lib.no_default:
  599. dtype_backend = "numpy" # type: ignore[assignment]
  600. assert dtype_backend is not lib.no_default
  601. with pandasSQL_builder(con) as pandas_sql:
  602. if isinstance(pandas_sql, SQLiteDatabase):
  603. return pandas_sql.read_query(
  604. sql,
  605. index_col=index_col,
  606. params=params,
  607. coerce_float=coerce_float,
  608. parse_dates=parse_dates,
  609. chunksize=chunksize,
  610. dtype_backend=dtype_backend,
  611. dtype=dtype,
  612. )
  613. try:
  614. _is_table_name = pandas_sql.has_table(sql)
  615. except Exception:
  616. # using generic exception to catch errors from sql drivers (GH24988)
  617. _is_table_name = False
  618. if _is_table_name:
  619. return pandas_sql.read_table(
  620. sql,
  621. index_col=index_col,
  622. coerce_float=coerce_float,
  623. parse_dates=parse_dates,
  624. columns=columns,
  625. chunksize=chunksize,
  626. dtype_backend=dtype_backend,
  627. )
  628. else:
  629. return pandas_sql.read_query(
  630. sql,
  631. index_col=index_col,
  632. params=params,
  633. coerce_float=coerce_float,
  634. parse_dates=parse_dates,
  635. chunksize=chunksize,
  636. dtype_backend=dtype_backend,
  637. dtype=dtype,
  638. )
  639. def to_sql(
  640. frame,
  641. name: str,
  642. con,
  643. schema: str | None = None,
  644. if_exists: Literal["fail", "replace", "append", "delete_rows"] = "fail",
  645. index: bool = True,
  646. index_label: IndexLabel | None = None,
  647. chunksize: int | None = None,
  648. dtype: DtypeArg | None = None,
  649. method: Literal["multi"] | Callable | None = None,
  650. engine: str = "auto",
  651. **engine_kwargs,
  652. ) -> int | None:
  653. """
  654. Write records stored in a DataFrame to a SQL database.
  655. .. warning::
  656. The pandas library does not attempt to sanitize inputs provided via a to_sql call.
  657. Please refer to the documentation for the underlying database driver to see if it
  658. will properly prevent injection, or alternatively be advised of a security risk when
  659. executing arbitrary commands in a to_sql call.
  660. Parameters
  661. ----------
  662. frame : DataFrame, Series
  663. name : str
  664. Name of SQL table.
  665. con : ADBC Connection, SQLAlchemy connectable, str, or sqlite3 connection
  666. or sqlite3 DBAPI2 connection
  667. ADBC provides high performance I/O with native type support, where available.
  668. Using SQLAlchemy makes it possible to use any DB supported by that
  669. library.
  670. If a DBAPI2 object, only sqlite3 is supported.
  671. schema : str, optional
  672. Name of SQL schema in database to write to (if database flavor
  673. supports this). If None, use default schema (default).
  674. if_exists : {'fail', 'replace', 'append', 'delete_rows'}, default 'fail'
  675. - fail: If table exists, do nothing.
  676. - replace: If table exists, drop it, recreate it, and insert data.
  677. - append: If table exists, insert data. Create if does not exist.
  678. - delete_rows: If a table exists, delete all records and insert data.
  679. index : bool, default True
  680. Write DataFrame index as a column.
  681. index_label : str or sequence, optional
  682. Column label for index column(s). If None is given (default) and
  683. `index` is True, then the index names are used.
  684. A sequence should be given if the DataFrame uses MultiIndex.
  685. chunksize : int, optional
  686. Specify the number of rows in each batch to be written at a time.
  687. By default, all rows will be written at once.
  688. dtype : dict or scalar, optional
  689. Specifying the datatype for columns. If a dictionary is used, the
  690. keys should be the column names and the values should be the
  691. SQLAlchemy types or strings for the sqlite3 fallback mode. If a
  692. scalar is provided, it will be applied to all columns.
  693. method : {None, 'multi', callable}, optional
  694. Controls the SQL insertion clause used:
  695. - None : Uses standard SQL ``INSERT`` clause (one per row).
  696. - ``'multi'``: Pass multiple values in a single ``INSERT`` clause.
  697. - callable with signature ``(pd_table, conn, keys, data_iter) -> int | None``.
  698. Details and a sample callable implementation can be found in the
  699. section :ref:`insert method <io.sql.method>`.
  700. engine : {'auto', 'sqlalchemy'}, default 'auto'
  701. SQL engine library to use. If 'auto', then the option
  702. ``io.sql.engine`` is used. The default ``io.sql.engine``
  703. behavior is 'sqlalchemy'
  704. **engine_kwargs
  705. Any additional kwargs are passed to the engine.
  706. Returns
  707. -------
  708. None or int
  709. Number of rows affected by to_sql. None is returned if the callable
  710. passed into ``method`` does not return an integer number of rows.
  711. Notes
  712. -----
  713. The returned rows affected is the sum of the ``rowcount`` attribute of ``sqlite3.Cursor``
  714. or SQLAlchemy connectable. If using ADBC the returned rows are the result
  715. of ``Cursor.adbc_ingest``. The returned value may not reflect the exact number of written
  716. rows as stipulated in the
  717. `sqlite3 <https://docs.python.org/3/library/sqlite3.html#sqlite3.Cursor.rowcount>`__ or
  718. `SQLAlchemy <https://docs.sqlalchemy.org/en/14/core/connections.html#sqlalchemy.engine.BaseCursorResult.rowcount>`__
  719. """ # noqa: E501
  720. if if_exists not in ("fail", "replace", "append", "delete_rows"):
  721. raise ValueError(f"'{if_exists}' is not valid for if_exists")
  722. if isinstance(frame, Series):
  723. frame = frame.to_frame()
  724. elif not isinstance(frame, DataFrame):
  725. raise NotImplementedError(
  726. "'frame' argument should be either a Series or a DataFrame"
  727. )
  728. with pandasSQL_builder(con, schema=schema, need_transaction=True) as pandas_sql:
  729. return pandas_sql.to_sql(
  730. frame,
  731. name,
  732. if_exists=if_exists,
  733. index=index,
  734. index_label=index_label,
  735. schema=schema,
  736. chunksize=chunksize,
  737. dtype=dtype,
  738. method=method,
  739. engine=engine,
  740. **engine_kwargs,
  741. )
  742. def has_table(table_name: str, con, schema: str | None = None) -> bool:
  743. """
  744. Check if DataBase has named table.
  745. Parameters
  746. ----------
  747. table_name: string
  748. Name of SQL table.
  749. con: ADBC Connection, SQLAlchemy connectable, str, or sqlite3 connection
  750. ADBC provides high performance I/O with native type support, where available.
  751. Using SQLAlchemy makes it possible to use any DB supported by that
  752. library.
  753. If a DBAPI2 object, only sqlite3 is supported.
  754. schema : string, default None
  755. Name of SQL schema in database to write to (if database flavor supports
  756. this). If None, use default schema (default).
  757. Returns
  758. -------
  759. boolean
  760. """
  761. with pandasSQL_builder(con, schema=schema) as pandas_sql:
  762. return pandas_sql.has_table(table_name)
  763. table_exists = has_table
  764. def pandasSQL_builder(
  765. con,
  766. schema: str | None = None,
  767. need_transaction: bool = False,
  768. ) -> PandasSQL:
  769. """
  770. Convenience function to return the correct PandasSQL subclass based on the
  771. provided parameters. Also creates a sqlalchemy connection and transaction
  772. if necessary.
  773. """
  774. import sqlite3
  775. if isinstance(con, sqlite3.Connection) or con is None:
  776. return SQLiteDatabase(con)
  777. sqlalchemy = import_optional_dependency("sqlalchemy", errors="ignore")
  778. if isinstance(con, str) and sqlalchemy is None:
  779. raise ImportError(
  780. f"Using URI string without version '{VERSIONS['sqlalchemy']}' or newer "
  781. "of 'sqlalchemy' installed."
  782. )
  783. if sqlalchemy is not None and isinstance(con, (str, sqlalchemy.engine.Connectable)):
  784. return SQLDatabase(con, schema, need_transaction)
  785. adbc = import_optional_dependency("adbc_driver_manager.dbapi", errors="ignore")
  786. if adbc and isinstance(con, adbc.Connection):
  787. return ADBCDatabase(con)
  788. warnings.warn(
  789. "pandas only supports SQLAlchemy connectable (engine/connection) or "
  790. "database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 "
  791. "objects are not tested. Please consider using SQLAlchemy.",
  792. UserWarning,
  793. stacklevel=find_stack_level(),
  794. )
  795. return SQLiteDatabase(con)
  796. class SQLTable(PandasObject):
  797. """
  798. For mapping Pandas tables to SQL tables.
  799. Uses fact that table is reflected by SQLAlchemy to
  800. do better type conversions.
  801. Also holds various flags needed to avoid having to
  802. pass them between functions all the time.
  803. """
  804. # TODO: support for multiIndex
  805. def __init__(
  806. self,
  807. name: str,
  808. pandas_sql_engine,
  809. frame=None,
  810. index: bool | str | list[str] | None = True,
  811. if_exists: Literal["fail", "replace", "append", "delete_rows"] = "fail",
  812. prefix: str = "pandas",
  813. index_label=None,
  814. schema=None,
  815. keys=None,
  816. dtype: DtypeArg | None = None,
  817. ) -> None:
  818. self.name = name
  819. self.pd_sql = pandas_sql_engine
  820. self.prefix = prefix
  821. self.frame = frame
  822. self.index = self._index_name(index, index_label)
  823. self.schema = schema
  824. self.if_exists = if_exists
  825. self.keys = keys
  826. self.dtype = dtype
  827. if frame is not None:
  828. # We want to initialize based on a dataframe
  829. self.table = self._create_table_setup()
  830. else:
  831. # no data provided, read-only mode
  832. self.table = self.pd_sql.get_table(self.name, self.schema)
  833. if self.table is None:
  834. raise ValueError(f"Could not init table '{name}'")
  835. if not len(self.name):
  836. raise ValueError("Empty table name specified")
  837. def exists(self):
  838. return self.pd_sql.has_table(self.name, self.schema)
  839. def sql_schema(self) -> str:
  840. from sqlalchemy.schema import CreateTable
  841. return str(CreateTable(self.table).compile(self.pd_sql.con))
  842. def _execute_create(self) -> None:
  843. # Inserting table into database, add to MetaData object
  844. self.table = self.table.to_metadata(self.pd_sql.meta)
  845. with self.pd_sql.run_transaction():
  846. self.table.create(bind=self.pd_sql.con)
  847. def create(self) -> None:
  848. if self.exists():
  849. if self.if_exists == "fail":
  850. raise ValueError(f"Table '{self.name}' already exists.")
  851. elif self.if_exists == "replace":
  852. self.pd_sql.drop_table(self.name, self.schema)
  853. self._execute_create()
  854. elif self.if_exists == "append":
  855. pass
  856. elif self.if_exists == "delete_rows":
  857. self.pd_sql.delete_rows(self.name, self.schema)
  858. else:
  859. raise ValueError(f"'{self.if_exists}' is not valid for if_exists")
  860. else:
  861. self._execute_create()
  862. def _execute_insert(self, conn, keys: list[str], data_iter) -> int:
  863. """
  864. Execute SQL statement inserting data
  865. Parameters
  866. ----------
  867. conn : sqlalchemy.engine.Engine or sqlalchemy.engine.Connection
  868. keys : list of str
  869. Column names
  870. data_iter : generator of list
  871. Each item contains a list of values to be inserted
  872. """
  873. data = [dict(zip(keys, row, strict=True)) for row in data_iter]
  874. result = self.pd_sql.execute(self.table.insert(), data)
  875. return result.rowcount
  876. def _execute_insert_multi(self, conn, keys: list[str], data_iter) -> int:
  877. """
  878. Alternative to _execute_insert for DBs support multi-value INSERT.
  879. Note: multi-value insert is usually faster for analytics DBs
  880. and tables containing a few columns
  881. but performance degrades quickly with increase of columns.
  882. """
  883. from sqlalchemy import insert
  884. data = [dict(zip(keys, row, strict=True)) for row in data_iter]
  885. stmt = insert(self.table).values(data)
  886. result = self.pd_sql.execute(stmt)
  887. return result.rowcount
  888. def insert_data(self) -> tuple[list[str], list[np.ndarray]]:
  889. if self.index is not None:
  890. temp = self.frame.copy(deep=False)
  891. temp.index.names = self.index
  892. try:
  893. temp.reset_index(inplace=True)
  894. except ValueError as err:
  895. raise ValueError(f"duplicate name in index/columns: {err}") from err
  896. else:
  897. temp = self.frame
  898. column_names = list(map(str, temp.columns))
  899. ncols = len(column_names)
  900. # this just pre-allocates the list: None's will be replaced with ndarrays
  901. # error: List item 0 has incompatible type "None"; expected "ndarray"
  902. data_list: list[np.ndarray] = [None] * ncols # type: ignore[list-item]
  903. for i, (_, ser) in enumerate(temp.items()):
  904. if ser.dtype.kind == "M":
  905. if isinstance(ser._values, ArrowExtensionArray):
  906. import pyarrow as pa
  907. if pa.types.is_date(ser.dtype.pyarrow_dtype):
  908. # GH#53854 to_pydatetime not supported for pyarrow date dtypes
  909. d = ser._values.to_numpy(dtype=object)
  910. else:
  911. d = ser.dt.to_pydatetime()._values
  912. else:
  913. d = ser._values.to_pydatetime()
  914. elif ser.dtype.kind == "m":
  915. vals = ser._values
  916. if isinstance(vals, ArrowExtensionArray):
  917. vals = vals.to_numpy(dtype=np.dtype("m8[ns]"))
  918. # store as integers, see GH#6921, GH#7076
  919. d = vals.view("i8").astype(object)
  920. else:
  921. d = ser._values.astype(object)
  922. assert isinstance(d, np.ndarray), type(d)
  923. if ser._can_hold_na:
  924. # Note: this will miss timedeltas since they are converted to int
  925. mask = isna(d)
  926. d[mask] = None
  927. data_list[i] = d
  928. return column_names, data_list
  929. def insert(
  930. self,
  931. chunksize: int | None = None,
  932. method: Literal["multi"] | Callable | None = None,
  933. ) -> int | None:
  934. # set insert method
  935. if method is None:
  936. exec_insert = self._execute_insert
  937. elif method == "multi":
  938. exec_insert = self._execute_insert_multi
  939. elif callable(method):
  940. exec_insert = partial(method, self)
  941. else:
  942. raise ValueError(f"Invalid parameter `method`: {method}")
  943. keys, data_list = self.insert_data()
  944. nrows = len(self.frame)
  945. if nrows == 0:
  946. return 0
  947. if chunksize is None:
  948. chunksize = nrows
  949. elif chunksize == 0:
  950. raise ValueError("chunksize argument should be non-zero")
  951. chunks = (nrows // chunksize) + 1
  952. total_inserted = None
  953. with self.pd_sql.run_transaction() as conn:
  954. for i in range(chunks):
  955. start_i = i * chunksize
  956. end_i = min((i + 1) * chunksize, nrows)
  957. if start_i >= end_i:
  958. break
  959. chunk_iter = zip(
  960. *(arr[start_i:end_i] for arr in data_list), strict=True
  961. )
  962. num_inserted = exec_insert(conn, keys, chunk_iter)
  963. # GH 46891
  964. if num_inserted is not None:
  965. if total_inserted is None:
  966. total_inserted = num_inserted
  967. else:
  968. total_inserted += num_inserted
  969. return total_inserted
  970. def _query_iterator(
  971. self,
  972. result,
  973. exit_stack: ExitStack,
  974. chunksize: int | None,
  975. columns,
  976. coerce_float: bool = True,
  977. parse_dates=None,
  978. dtype_backend: DtypeBackend | Literal["numpy"] = "numpy",
  979. ) -> Generator[DataFrame]:
  980. """Return generator through chunked result set."""
  981. has_read_data = False
  982. with exit_stack:
  983. while True:
  984. data = result.fetchmany(chunksize)
  985. if not data:
  986. if not has_read_data:
  987. yield DataFrame.from_records(
  988. [], columns=columns, coerce_float=coerce_float
  989. )
  990. break
  991. has_read_data = True
  992. self.frame = _convert_arrays_to_dataframe(
  993. data, columns, coerce_float, dtype_backend
  994. )
  995. self._harmonize_columns(
  996. parse_dates=parse_dates, dtype_backend=dtype_backend
  997. )
  998. if self.index is not None:
  999. self.frame.set_index(self.index, inplace=True)
  1000. yield self.frame
  1001. def read(
  1002. self,
  1003. exit_stack: ExitStack,
  1004. coerce_float: bool = True,
  1005. parse_dates=None,
  1006. columns=None,
  1007. chunksize: int | None = None,
  1008. dtype_backend: DtypeBackend | Literal["numpy"] = "numpy",
  1009. ) -> DataFrame | Iterator[DataFrame]:
  1010. from sqlalchemy import select
  1011. if columns is not None and len(columns) > 0:
  1012. cols = [self.table.c[n] for n in columns]
  1013. if self.index is not None:
  1014. for idx in self.index[::-1]:
  1015. cols.insert(0, self.table.c[idx])
  1016. sql_select = select(*cols)
  1017. else:
  1018. sql_select = select(self.table)
  1019. result = self.pd_sql.execute(sql_select)
  1020. column_names = result.keys()
  1021. if chunksize is not None:
  1022. return self._query_iterator(
  1023. result,
  1024. exit_stack,
  1025. chunksize,
  1026. column_names,
  1027. coerce_float=coerce_float,
  1028. parse_dates=parse_dates,
  1029. dtype_backend=dtype_backend,
  1030. )
  1031. else:
  1032. data = result.fetchall()
  1033. self.frame = _convert_arrays_to_dataframe(
  1034. data, column_names, coerce_float, dtype_backend
  1035. )
  1036. self._harmonize_columns(
  1037. parse_dates=parse_dates, dtype_backend=dtype_backend
  1038. )
  1039. if self.index is not None:
  1040. self.frame.set_index(self.index, inplace=True)
  1041. return self.frame
  1042. def _index_name(self, index, index_label):
  1043. # for writing: index=True to include index in sql table
  1044. if index is True:
  1045. nlevels = self.frame.index.nlevels
  1046. # if index_label is specified, set this as index name(s)
  1047. if index_label is not None:
  1048. if not isinstance(index_label, list):
  1049. index_label = [index_label]
  1050. if len(index_label) != nlevels:
  1051. raise ValueError(
  1052. "Length of 'index_label' should match number of "
  1053. f"levels, which is {nlevels}"
  1054. )
  1055. return index_label
  1056. # return the used column labels for the index columns
  1057. if (
  1058. nlevels == 1
  1059. and "index" not in self.frame.columns
  1060. and self.frame.index.name is None
  1061. ):
  1062. return ["index"]
  1063. else:
  1064. return com.fill_missing_names(self.frame.index.names)
  1065. # for reading: index=(list of) string to specify column to set as index
  1066. elif isinstance(index, str):
  1067. return [index]
  1068. elif isinstance(index, list):
  1069. return index
  1070. else:
  1071. return None
  1072. def _get_column_names_and_types(self, dtype_mapper):
  1073. column_names_and_types = []
  1074. if self.index is not None:
  1075. for i, idx_label in enumerate(self.index):
  1076. idx_type = dtype_mapper(self.frame.index._get_level_values(i))
  1077. column_names_and_types.append((str(idx_label), idx_type, True))
  1078. column_names_and_types += [
  1079. (str(self.frame.columns[i]), dtype_mapper(self.frame.iloc[:, i]), False)
  1080. for i in range(len(self.frame.columns))
  1081. ]
  1082. return column_names_and_types
  1083. def _create_table_setup(self):
  1084. from sqlalchemy import (
  1085. Column,
  1086. PrimaryKeyConstraint,
  1087. Table,
  1088. )
  1089. from sqlalchemy.schema import MetaData
  1090. column_names_and_types = self._get_column_names_and_types(self._sqlalchemy_type)
  1091. columns: list[Any] = [
  1092. Column(name, typ, index=is_index)
  1093. for name, typ, is_index in column_names_and_types
  1094. ]
  1095. if self.keys is not None:
  1096. if not is_list_like(self.keys):
  1097. keys = [self.keys]
  1098. else:
  1099. keys = self.keys
  1100. pkc = PrimaryKeyConstraint(*keys, name=self.name + "_pk")
  1101. columns.append(pkc)
  1102. schema = self.schema or self.pd_sql.meta.schema
  1103. # At this point, attach to new metadata, only attach to self.meta
  1104. # once table is created.
  1105. meta = MetaData()
  1106. return Table(self.name, meta, *columns, schema=schema)
  1107. def _harmonize_columns(
  1108. self,
  1109. parse_dates=None,
  1110. dtype_backend: DtypeBackend | Literal["numpy"] = "numpy",
  1111. ) -> None:
  1112. """
  1113. Make the DataFrame's column types align with the SQL table
  1114. column types.
  1115. Need to work around limited NA value support. Floats are always
  1116. fine, ints must always be floats if there are Null values.
  1117. Booleans are hard because converting bool column with None replaces
  1118. all Nones with false. Therefore only convert bool if there are no
  1119. NA values.
  1120. Datetimes should already be converted to np.datetime64 if supported,
  1121. but here we also force conversion if required.
  1122. """
  1123. parse_dates = _process_parse_dates_argument(parse_dates)
  1124. for sql_col in self.table.columns:
  1125. col_name = sql_col.name
  1126. try:
  1127. df_col = self.frame[col_name]
  1128. # Handle date parsing upfront; don't try to convert columns
  1129. # twice
  1130. if col_name in parse_dates:
  1131. try:
  1132. fmt = parse_dates[col_name]
  1133. except TypeError:
  1134. fmt = None
  1135. self.frame[col_name] = _handle_date_column(df_col, format=fmt)
  1136. continue
  1137. # the type the dataframe column should have
  1138. col_type = self._get_dtype(sql_col.type)
  1139. if (
  1140. col_type is datetime
  1141. or col_type is date
  1142. or col_type is DatetimeTZDtype
  1143. ):
  1144. # Convert tz-aware Datetime SQL columns to UTC
  1145. utc = col_type is DatetimeTZDtype
  1146. self.frame[col_name] = _handle_date_column(df_col, utc=utc)
  1147. elif dtype_backend == "numpy" and col_type is float:
  1148. # floats support NA, can always convert!
  1149. self.frame[col_name] = df_col.astype(col_type)
  1150. elif (
  1151. using_string_dtype()
  1152. and is_string_dtype(col_type)
  1153. and is_object_dtype(self.frame[col_name])
  1154. ):
  1155. self.frame[col_name] = df_col.astype(col_type)
  1156. elif dtype_backend == "numpy" and len(df_col) == df_col.count():
  1157. # No NA values, can convert ints and bools
  1158. if col_type is np.dtype("int64") or col_type is bool:
  1159. self.frame[col_name] = df_col.astype(col_type)
  1160. except KeyError:
  1161. pass # this column not in results
  1162. def _sqlalchemy_type(self, col: Index | Series):
  1163. dtype: DtypeArg = self.dtype or {}
  1164. if is_dict_like(dtype):
  1165. dtype = cast(dict, dtype)
  1166. if col.name in dtype:
  1167. return dtype[col.name]
  1168. # Infer type of column, while ignoring missing values.
  1169. # Needed for inserting typed data containing NULLs, GH 8778.
  1170. col_type = lib.infer_dtype(col, skipna=True)
  1171. from sqlalchemy.types import (
  1172. TIMESTAMP,
  1173. BigInteger,
  1174. Boolean,
  1175. Date,
  1176. DateTime,
  1177. Float,
  1178. Integer,
  1179. SmallInteger,
  1180. Text,
  1181. Time,
  1182. )
  1183. if col_type in ("datetime64", "datetime"):
  1184. # GH 9086: TIMESTAMP is the suggested type if the column contains
  1185. # timezone information
  1186. try:
  1187. # error: Item "Index" of "Union[Index, Series]" has no attribute "dt"
  1188. if col.dt.tz is not None: # type: ignore[union-attr]
  1189. return TIMESTAMP(timezone=True)
  1190. except AttributeError:
  1191. # The column is actually a DatetimeIndex
  1192. # GH 26761 or an Index with date-like data e.g. 9999-01-01
  1193. if getattr(col, "tz", None) is not None:
  1194. return TIMESTAMP(timezone=True)
  1195. return DateTime
  1196. if col_type == "timedelta64":
  1197. warnings.warn(
  1198. "the 'timedelta' type is not supported, and will be "
  1199. "written as integer values (ns frequency) to the database.",
  1200. UserWarning,
  1201. stacklevel=find_stack_level(),
  1202. )
  1203. return BigInteger
  1204. elif col_type == "floating":
  1205. if col.dtype == "float32":
  1206. return Float(precision=23)
  1207. else:
  1208. return Float(precision=53)
  1209. elif col_type == "integer":
  1210. # GH35076 Map pandas integer to optimal SQLAlchemy integer type
  1211. if col.dtype.name.lower() in ("int8", "uint8", "int16"):
  1212. return SmallInteger
  1213. elif col.dtype.name.lower() in ("uint16", "int32"):
  1214. return Integer
  1215. elif col.dtype.name.lower() == "uint64":
  1216. raise ValueError("Unsigned 64 bit integer datatype is not supported")
  1217. else:
  1218. return BigInteger
  1219. elif col_type == "boolean":
  1220. return Boolean
  1221. elif col_type == "date":
  1222. return Date
  1223. elif col_type == "time":
  1224. return Time
  1225. elif col_type == "complex":
  1226. raise ValueError("Complex datatypes not supported")
  1227. return Text
  1228. def _get_dtype(self, sqltype):
  1229. from sqlalchemy.types import (
  1230. TIMESTAMP,
  1231. Boolean,
  1232. Date,
  1233. DateTime,
  1234. Float,
  1235. Integer,
  1236. String,
  1237. )
  1238. if isinstance(sqltype, Float):
  1239. return float
  1240. elif isinstance(sqltype, Integer):
  1241. # TODO: Refine integer size.
  1242. return np.dtype("int64")
  1243. elif isinstance(sqltype, TIMESTAMP):
  1244. # we have a timezone capable type
  1245. if not sqltype.timezone:
  1246. return datetime
  1247. return DatetimeTZDtype
  1248. elif isinstance(sqltype, DateTime):
  1249. # Caution: np.datetime64 is also a subclass of np.number.
  1250. return datetime
  1251. elif isinstance(sqltype, Date):
  1252. return date
  1253. elif isinstance(sqltype, Boolean):
  1254. return bool
  1255. elif isinstance(sqltype, String):
  1256. if using_string_dtype():
  1257. return StringDtype(na_value=np.nan)
  1258. return object
  1259. class PandasSQL(PandasObject, ABC):
  1260. """
  1261. Subclasses Should define read_query and to_sql.
  1262. """
  1263. def __enter__(self) -> Self:
  1264. return self
  1265. def __exit__(self, *args) -> None:
  1266. pass
  1267. def read_table(
  1268. self,
  1269. table_name: str,
  1270. index_col: str | list[str] | None = None,
  1271. coerce_float: bool = True,
  1272. parse_dates=None,
  1273. columns=None,
  1274. schema: str | None = None,
  1275. chunksize: int | None = None,
  1276. dtype_backend: DtypeBackend | Literal["numpy"] = "numpy",
  1277. ) -> DataFrame | Iterator[DataFrame]:
  1278. raise NotImplementedError
  1279. @abstractmethod
  1280. def read_query(
  1281. self,
  1282. sql: str,
  1283. index_col: str | list[str] | None = None,
  1284. coerce_float: bool = True,
  1285. parse_dates=None,
  1286. params=None,
  1287. chunksize: int | None = None,
  1288. dtype: DtypeArg | None = None,
  1289. dtype_backend: DtypeBackend | Literal["numpy"] = "numpy",
  1290. ) -> DataFrame | Iterator[DataFrame]:
  1291. pass
  1292. @abstractmethod
  1293. def to_sql(
  1294. self,
  1295. frame,
  1296. name: str,
  1297. if_exists: Literal["fail", "replace", "append", "delete_rows"] = "fail",
  1298. index: bool = True,
  1299. index_label=None,
  1300. schema=None,
  1301. chunksize: int | None = None,
  1302. dtype: DtypeArg | None = None,
  1303. method: Literal["multi"] | Callable | None = None,
  1304. engine: str = "auto",
  1305. **engine_kwargs,
  1306. ) -> int | None:
  1307. pass
  1308. @abstractmethod
  1309. def execute(self, sql: str | Select | TextClause, params=None):
  1310. pass
  1311. @abstractmethod
  1312. def has_table(self, name: str, schema: str | None = None) -> bool:
  1313. pass
  1314. @abstractmethod
  1315. def _create_sql_schema(
  1316. self,
  1317. frame: DataFrame,
  1318. table_name: str,
  1319. keys: list[str] | None = None,
  1320. dtype: DtypeArg | None = None,
  1321. schema: str | None = None,
  1322. ) -> str:
  1323. pass
  1324. class BaseEngine:
  1325. def insert_records(
  1326. self,
  1327. table: SQLTable,
  1328. con,
  1329. frame,
  1330. name: str,
  1331. index: bool | str | list[str] | None = True,
  1332. schema=None,
  1333. chunksize: int | None = None,
  1334. method=None,
  1335. **engine_kwargs,
  1336. ) -> int | None:
  1337. """
  1338. Inserts data into already-prepared table
  1339. """
  1340. raise AbstractMethodError(self)
  1341. class SQLAlchemyEngine(BaseEngine):
  1342. def __init__(self) -> None:
  1343. import_optional_dependency(
  1344. "sqlalchemy", extra="sqlalchemy is required for SQL support."
  1345. )
  1346. def insert_records(
  1347. self,
  1348. table: SQLTable,
  1349. con,
  1350. frame,
  1351. name: str,
  1352. index: bool | str | list[str] | None = True,
  1353. schema=None,
  1354. chunksize: int | None = None,
  1355. method=None,
  1356. **engine_kwargs,
  1357. ) -> int | None:
  1358. from sqlalchemy import exc
  1359. try:
  1360. return table.insert(chunksize=chunksize, method=method)
  1361. except exc.StatementError as err:
  1362. # GH34431
  1363. # https://stackoverflow.com/a/67358288/6067848
  1364. msg = r"""(\(1054, "Unknown column 'inf(e0)?' in 'field list'"\))(?#
  1365. )|inf can not be used with MySQL"""
  1366. err_text = str(err.orig)
  1367. if re.search(msg, err_text):
  1368. raise ValueError("inf cannot be used with MySQL") from err
  1369. raise err
  1370. def get_engine(engine: str) -> BaseEngine:
  1371. """return our implementation"""
  1372. if engine == "auto":
  1373. engine = get_option("io.sql.engine")
  1374. if engine == "auto":
  1375. # try engines in this order
  1376. engine_classes = [SQLAlchemyEngine]
  1377. error_msgs = ""
  1378. for engine_class in engine_classes:
  1379. try:
  1380. return engine_class()
  1381. except ImportError as err:
  1382. error_msgs += "\n - " + str(err)
  1383. raise ImportError(
  1384. "Unable to find a usable engine; "
  1385. "tried using: 'sqlalchemy'.\n"
  1386. "A suitable version of "
  1387. "sqlalchemy is required for sql I/O "
  1388. "support.\n"
  1389. "Trying to import the above resulted in these errors:"
  1390. f"{error_msgs}"
  1391. )
  1392. if engine == "sqlalchemy":
  1393. return SQLAlchemyEngine()
  1394. raise ValueError("engine must be one of 'auto', 'sqlalchemy'")
  1395. class SQLDatabase(PandasSQL):
  1396. """
  1397. This class enables conversion between DataFrame and SQL databases
  1398. using SQLAlchemy to handle DataBase abstraction.
  1399. Parameters
  1400. ----------
  1401. con : SQLAlchemy Connectable or URI string.
  1402. Connectable to connect with the database. Using SQLAlchemy makes it
  1403. possible to use any DB supported by that library.
  1404. schema : string, default None
  1405. Name of SQL schema in database to write to (if database flavor
  1406. supports this). If None, use default schema (default).
  1407. need_transaction : bool, default False
  1408. If True, SQLDatabase will create a transaction.
  1409. """
  1410. def __init__(
  1411. self, con, schema: str | None = None, need_transaction: bool = False
  1412. ) -> None:
  1413. from sqlalchemy import create_engine
  1414. from sqlalchemy.engine import Engine
  1415. from sqlalchemy.schema import MetaData
  1416. # self.exit_stack cleans up the Engine and Connection and commits the
  1417. # transaction if any of those objects was created below.
  1418. # Cleanup happens either in self.__exit__ or at the end of the iterator
  1419. # returned by read_sql when chunksize is not None.
  1420. self.exit_stack = ExitStack()
  1421. if isinstance(con, str):
  1422. con = create_engine(con)
  1423. self.exit_stack.callback(con.dispose)
  1424. if isinstance(con, Engine):
  1425. con = self.exit_stack.enter_context(con.connect())
  1426. if need_transaction and not con.in_transaction():
  1427. self.exit_stack.enter_context(con.begin())
  1428. self.con = con
  1429. self.meta = MetaData(schema=schema)
  1430. self.returns_generator = False
  1431. def __exit__(self, *args) -> None:
  1432. if not self.returns_generator:
  1433. self.exit_stack.close()
  1434. @contextmanager
  1435. def run_transaction(self):
  1436. if not self.con.in_transaction():
  1437. with self.con.begin():
  1438. yield self.con
  1439. else:
  1440. yield self.con
  1441. def execute(self, sql: str | Select | TextClause | Delete, params=None):
  1442. """Simple passthrough to SQLAlchemy connectable"""
  1443. from sqlalchemy.exc import SQLAlchemyError
  1444. args = [] if params is None else [params]
  1445. if isinstance(sql, str):
  1446. execute_function = self.con.exec_driver_sql
  1447. else:
  1448. execute_function = self.con.execute
  1449. try:
  1450. return execute_function(sql, *args)
  1451. except SQLAlchemyError as exc:
  1452. raise DatabaseError(f"Execution failed on sql '{sql}': {exc}") from exc
  1453. def read_table(
  1454. self,
  1455. table_name: str,
  1456. index_col: str | list[str] | None = None,
  1457. coerce_float: bool = True,
  1458. parse_dates=None,
  1459. columns=None,
  1460. schema: str | None = None,
  1461. chunksize: int | None = None,
  1462. dtype_backend: DtypeBackend | Literal["numpy"] = "numpy",
  1463. ) -> DataFrame | Iterator[DataFrame]:
  1464. """
  1465. Read SQL database table into a DataFrame.
  1466. Parameters
  1467. ----------
  1468. table_name : str
  1469. Name of SQL table in database.
  1470. index_col : string, optional, default: None
  1471. Column to set as index.
  1472. coerce_float : bool, default True
  1473. Attempts to convert values of non-string, non-numeric objects
  1474. (like decimal.Decimal) to floating point. This can result in
  1475. loss of precision.
  1476. parse_dates : list or dict, default: None
  1477. - List of column names to parse as dates.
  1478. - Dict of ``{column_name: format string}`` where format string is
  1479. strftime compatible in case of parsing string times, or is one of
  1480. (D, s, ns, ms, us) in case of parsing integer timestamps.
  1481. - Dict of ``{column_name: arg}``, where the arg corresponds
  1482. to the keyword arguments of :func:`pandas.to_datetime`.
  1483. Especially useful with databases without native Datetime support,
  1484. such as SQLite.
  1485. columns : list, default: None
  1486. List of column names to select from SQL table.
  1487. schema : string, default None
  1488. Name of SQL schema in database to query (if database flavor
  1489. supports this). If specified, this overwrites the default
  1490. schema of the SQL database object.
  1491. chunksize : int, default None
  1492. If specified, return an iterator where `chunksize` is the number
  1493. of rows to include in each chunk.
  1494. dtype_backend : {'numpy_nullable', 'pyarrow'}
  1495. Back-end data type applied to the resultant :class:`DataFrame`
  1496. (still experimental). If not specified, the default behavior
  1497. is to not use nullable data types. If specified, the behavior
  1498. is as follows:
  1499. * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
  1500. * ``"pyarrow"``: returns pyarrow-backed nullable
  1501. :class:`ArrowDtype` :class:`DataFrame`
  1502. .. versionadded:: 2.0
  1503. Returns
  1504. -------
  1505. DataFrame
  1506. See Also
  1507. --------
  1508. pandas.read_sql_table
  1509. SQLDatabase.read_query
  1510. """
  1511. self.meta.reflect(bind=self.con, only=[table_name], views=True)
  1512. table = SQLTable(table_name, self, index=index_col, schema=schema)
  1513. if chunksize is not None:
  1514. self.returns_generator = True
  1515. return table.read(
  1516. self.exit_stack,
  1517. coerce_float=coerce_float,
  1518. parse_dates=parse_dates,
  1519. columns=columns,
  1520. chunksize=chunksize,
  1521. dtype_backend=dtype_backend,
  1522. )
  1523. @staticmethod
  1524. def _query_iterator(
  1525. result,
  1526. exit_stack: ExitStack,
  1527. chunksize: int,
  1528. columns,
  1529. index_col=None,
  1530. coerce_float: bool = True,
  1531. parse_dates=None,
  1532. dtype: DtypeArg | None = None,
  1533. dtype_backend: DtypeBackend | Literal["numpy"] = "numpy",
  1534. ) -> Generator[DataFrame]:
  1535. """Return generator through chunked result set"""
  1536. has_read_data = False
  1537. with exit_stack:
  1538. while True:
  1539. data = result.fetchmany(chunksize)
  1540. if not data:
  1541. if not has_read_data:
  1542. yield _wrap_result(
  1543. [],
  1544. columns,
  1545. index_col=index_col,
  1546. coerce_float=coerce_float,
  1547. parse_dates=parse_dates,
  1548. dtype=dtype,
  1549. dtype_backend=dtype_backend,
  1550. )
  1551. break
  1552. has_read_data = True
  1553. yield _wrap_result(
  1554. data,
  1555. columns,
  1556. index_col=index_col,
  1557. coerce_float=coerce_float,
  1558. parse_dates=parse_dates,
  1559. dtype=dtype,
  1560. dtype_backend=dtype_backend,
  1561. )
  1562. def read_query(
  1563. self,
  1564. sql: str,
  1565. index_col: str | list[str] | None = None,
  1566. coerce_float: bool = True,
  1567. parse_dates=None,
  1568. params=None,
  1569. chunksize: int | None = None,
  1570. dtype: DtypeArg | None = None,
  1571. dtype_backend: DtypeBackend | Literal["numpy"] = "numpy",
  1572. ) -> DataFrame | Iterator[DataFrame]:
  1573. """
  1574. Read SQL query into a DataFrame.
  1575. Parameters
  1576. ----------
  1577. sql : str
  1578. SQL query to be executed.
  1579. index_col : string, optional, default: None
  1580. Column name to use as index for the returned DataFrame object.
  1581. coerce_float : bool, default True
  1582. Attempt to convert values of non-string, non-numeric objects (like
  1583. decimal.Decimal) to floating point, useful for SQL result sets.
  1584. params : list, tuple or dict, optional, default: None
  1585. List of parameters to pass to execute method. The syntax used
  1586. to pass parameters is database driver dependent. Check your
  1587. database driver documentation for which of the five syntax styles,
  1588. described in PEP 249's paramstyle, is supported.
  1589. Eg. for psycopg2, uses %(name)s so use params={'name' : 'value'}
  1590. parse_dates : list or dict, default: None
  1591. - List of column names to parse as dates.
  1592. - Dict of ``{column_name: format string}`` where format string is
  1593. strftime compatible in case of parsing string times, or is one of
  1594. (D, s, ns, ms, us) in case of parsing integer timestamps.
  1595. - Dict of ``{column_name: arg dict}``, where the arg dict
  1596. corresponds to the keyword arguments of
  1597. :func:`pandas.to_datetime` Especially useful with databases
  1598. without native Datetime support, such as SQLite.
  1599. chunksize : int, default None
  1600. If specified, return an iterator where `chunksize` is the number
  1601. of rows to include in each chunk.
  1602. dtype : Type name or dict of columns
  1603. Data type for data or columns. E.g. np.float64 or
  1604. {'a': np.float64, 'b': np.int32, 'c': 'Int64'}
  1605. Returns
  1606. -------
  1607. DataFrame
  1608. See Also
  1609. --------
  1610. read_sql_table : Read SQL database table into a DataFrame.
  1611. read_sql
  1612. """
  1613. result = self.execute(sql, params)
  1614. columns = result.keys()
  1615. if chunksize is not None:
  1616. self.returns_generator = True
  1617. return self._query_iterator(
  1618. result,
  1619. self.exit_stack,
  1620. chunksize,
  1621. columns,
  1622. index_col=index_col,
  1623. coerce_float=coerce_float,
  1624. parse_dates=parse_dates,
  1625. dtype=dtype,
  1626. dtype_backend=dtype_backend,
  1627. )
  1628. else:
  1629. data = result.fetchall()
  1630. frame = _wrap_result(
  1631. data,
  1632. columns,
  1633. index_col=index_col,
  1634. coerce_float=coerce_float,
  1635. parse_dates=parse_dates,
  1636. dtype=dtype,
  1637. dtype_backend=dtype_backend,
  1638. )
  1639. return frame
  1640. read_sql = read_query
  1641. def prep_table(
  1642. self,
  1643. frame,
  1644. name: str,
  1645. if_exists: Literal["fail", "replace", "append", "delete_rows"] = "fail",
  1646. index: bool | str | list[str] | None = True,
  1647. index_label=None,
  1648. schema=None,
  1649. dtype: DtypeArg | None = None,
  1650. ) -> SQLTable:
  1651. """
  1652. Prepares table in the database for data insertion. Creates it if needed, etc.
  1653. """
  1654. if dtype:
  1655. if not is_dict_like(dtype):
  1656. # error: Value expression in dictionary comprehension has incompatible
  1657. # type "Union[ExtensionDtype, str, dtype[Any], Type[object],
  1658. # Dict[Hashable, Union[ExtensionDtype, Union[str, dtype[Any]],
  1659. # Type[str], Type[float], Type[int], Type[complex], Type[bool],
  1660. # Type[object]]]]"; expected type "Union[ExtensionDtype, str,
  1661. # dtype[Any], Type[object]]"
  1662. dtype = dict.fromkeys(frame, dtype) # type: ignore[arg-type]
  1663. else:
  1664. dtype = cast(dict, dtype)
  1665. from sqlalchemy.types import TypeEngine
  1666. for col, my_type in dtype.items():
  1667. if isinstance(my_type, type) and issubclass(my_type, TypeEngine):
  1668. pass
  1669. elif isinstance(my_type, TypeEngine):
  1670. pass
  1671. else:
  1672. raise ValueError(f"The type of {col} is not a SQLAlchemy type")
  1673. table = SQLTable(
  1674. name,
  1675. self,
  1676. frame=frame,
  1677. index=index,
  1678. if_exists=if_exists,
  1679. index_label=index_label,
  1680. schema=schema,
  1681. dtype=dtype,
  1682. )
  1683. table.create()
  1684. return table
  1685. def check_case_sensitive(
  1686. self,
  1687. name: str,
  1688. schema: str | None,
  1689. ) -> None:
  1690. """
  1691. Checks table name for issues with case-sensitivity.
  1692. Method is called after data is inserted.
  1693. """
  1694. if not name.isdigit() and not name.islower():
  1695. # check for potentially case sensitivity issues (GH7815)
  1696. # Only check when name is not a number and name is not lower case
  1697. from sqlalchemy import inspect as sqlalchemy_inspect
  1698. insp = sqlalchemy_inspect(self.con)
  1699. table_names = insp.get_table_names(schema=schema or self.meta.schema)
  1700. if name not in table_names:
  1701. msg = (
  1702. f"The provided table name '{name}' is not found exactly as "
  1703. "such in the database after writing the table, possibly "
  1704. "due to case sensitivity issues. Consider using lower "
  1705. "case table names."
  1706. )
  1707. warnings.warn(
  1708. msg,
  1709. UserWarning,
  1710. stacklevel=find_stack_level(),
  1711. )
  1712. def to_sql(
  1713. self,
  1714. frame,
  1715. name: str,
  1716. if_exists: Literal["fail", "replace", "append", "delete_rows"] = "fail",
  1717. index: bool = True,
  1718. index_label=None,
  1719. schema: str | None = None,
  1720. chunksize: int | None = None,
  1721. dtype: DtypeArg | None = None,
  1722. method: Literal["multi"] | Callable | None = None,
  1723. engine: str = "auto",
  1724. **engine_kwargs,
  1725. ) -> int | None:
  1726. """
  1727. Write records stored in a DataFrame to a SQL database.
  1728. Parameters
  1729. ----------
  1730. frame : DataFrame
  1731. name : string
  1732. Name of SQL table.
  1733. if_exists : {'fail', 'replace', 'append', 'delete_rows'}, default 'fail'
  1734. - fail: If table exists, do nothing.
  1735. - replace: If table exists, drop it, recreate it, and insert data.
  1736. - append: If table exists, insert data. Create if does not exist.
  1737. - delete_rows: If a table exists, delete all records and insert data.
  1738. index : boolean, default True
  1739. Write DataFrame index as a column.
  1740. index_label : string or sequence, default None
  1741. Column label for index column(s). If None is given (default) and
  1742. `index` is True, then the index names are used.
  1743. A sequence should be given if the DataFrame uses MultiIndex.
  1744. schema : string, default None
  1745. Name of SQL schema in database to write to (if database flavor
  1746. supports this). If specified, this overwrites the default
  1747. schema of the SQLDatabase object.
  1748. chunksize : int, default None
  1749. If not None, then rows will be written in batches of this size at a
  1750. time. If None, all rows will be written at once.
  1751. dtype : single type or dict of column name to SQL type, default None
  1752. Optional specifying the datatype for columns. The SQL type should
  1753. be a SQLAlchemy type. If all columns are of the same type, one
  1754. single value can be used.
  1755. method : {None', 'multi', callable}, default None
  1756. Controls the SQL insertion clause used:
  1757. * None : Uses standard SQL ``INSERT`` clause (one per row).
  1758. * 'multi': Pass multiple values in a single ``INSERT`` clause.
  1759. * callable with signature ``(pd_table, conn, keys, data_iter)``.
  1760. Details and a sample callable implementation can be found in the
  1761. section :ref:`insert method <io.sql.method>`.
  1762. engine : {'auto', 'sqlalchemy'}, default 'auto'
  1763. SQL engine library to use. If 'auto', then the option
  1764. ``io.sql.engine`` is used. The default ``io.sql.engine``
  1765. behavior is 'sqlalchemy'
  1766. **engine_kwargs
  1767. Any additional kwargs are passed to the engine.
  1768. """
  1769. sql_engine = get_engine(engine)
  1770. table = self.prep_table(
  1771. frame=frame,
  1772. name=name,
  1773. if_exists=if_exists,
  1774. index=index,
  1775. index_label=index_label,
  1776. schema=schema,
  1777. dtype=dtype,
  1778. )
  1779. total_inserted = sql_engine.insert_records(
  1780. table=table,
  1781. con=self.con,
  1782. frame=frame,
  1783. name=name,
  1784. index=index,
  1785. schema=schema,
  1786. chunksize=chunksize,
  1787. method=method,
  1788. **engine_kwargs,
  1789. )
  1790. self.check_case_sensitive(name=name, schema=schema)
  1791. return total_inserted
  1792. @property
  1793. def tables(self):
  1794. return self.meta.tables
  1795. def has_table(self, name: str, schema: str | None = None) -> bool:
  1796. from sqlalchemy import inspect as sqlalchemy_inspect
  1797. insp = sqlalchemy_inspect(self.con)
  1798. return insp.has_table(name, schema or self.meta.schema)
  1799. def get_table(self, table_name: str, schema: str | None = None) -> Table:
  1800. from sqlalchemy import (
  1801. Numeric,
  1802. Table,
  1803. )
  1804. schema = schema or self.meta.schema
  1805. tbl = Table(table_name, self.meta, autoload_with=self.con, schema=schema)
  1806. for column in tbl.columns:
  1807. if isinstance(column.type, Numeric):
  1808. column.type.asdecimal = False
  1809. return tbl
  1810. def drop_table(self, table_name: str, schema: str | None = None) -> None:
  1811. schema = schema or self.meta.schema
  1812. if self.has_table(table_name, schema):
  1813. self.meta.reflect(
  1814. bind=self.con, only=[table_name], schema=schema, views=True
  1815. )
  1816. with self.run_transaction():
  1817. self.get_table(table_name, schema).drop(bind=self.con)
  1818. self.meta.clear()
  1819. def delete_rows(self, table_name: str, schema: str | None = None) -> None:
  1820. schema = schema or self.meta.schema
  1821. if self.has_table(table_name, schema):
  1822. self.meta.reflect(
  1823. bind=self.con, only=[table_name], schema=schema, views=True
  1824. )
  1825. table = self.get_table(table_name, schema)
  1826. self.execute(table.delete()).close()
  1827. self.meta.clear()
  1828. def _create_sql_schema(
  1829. self,
  1830. frame: DataFrame,
  1831. table_name: str,
  1832. keys: list[str] | None = None,
  1833. dtype: DtypeArg | None = None,
  1834. schema: str | None = None,
  1835. ) -> str:
  1836. table = SQLTable(
  1837. table_name,
  1838. self,
  1839. frame=frame,
  1840. index=False,
  1841. keys=keys,
  1842. dtype=dtype,
  1843. schema=schema,
  1844. )
  1845. return str(table.sql_schema())
  1846. # ---- SQL without SQLAlchemy ---
  1847. class ADBCDatabase(PandasSQL):
  1848. """
  1849. This class enables conversion between DataFrame and SQL databases
  1850. using ADBC to handle DataBase abstraction.
  1851. Parameters
  1852. ----------
  1853. con : adbc_driver_manager.dbapi.Connection
  1854. """
  1855. def __init__(self, con) -> None:
  1856. self.con = con
  1857. @contextmanager
  1858. def run_transaction(self):
  1859. with self.con.cursor() as cur:
  1860. try:
  1861. yield cur
  1862. except Exception:
  1863. self.con.rollback()
  1864. raise
  1865. self.con.commit()
  1866. def execute(self, sql: str | Select | TextClause, params=None):
  1867. from adbc_driver_manager import Error
  1868. if not isinstance(sql, str):
  1869. raise TypeError("Query must be a string unless using sqlalchemy.")
  1870. args = [] if params is None else [params]
  1871. cur = self.con.cursor()
  1872. try:
  1873. cur.execute(sql, *args)
  1874. return cur
  1875. except Error as exc:
  1876. try:
  1877. self.con.rollback()
  1878. except Error as inner_exc: # pragma: no cover
  1879. ex = DatabaseError(
  1880. f"Execution failed on sql: {sql}\n{exc}\nunable to rollback"
  1881. )
  1882. raise ex from inner_exc
  1883. ex = DatabaseError(f"Execution failed on sql '{sql}': {exc}")
  1884. raise ex from exc
  1885. def read_table(
  1886. self,
  1887. table_name: str,
  1888. index_col: str | list[str] | None = None,
  1889. coerce_float: bool = True,
  1890. parse_dates=None,
  1891. columns=None,
  1892. schema: str | None = None,
  1893. chunksize: int | None = None,
  1894. dtype_backend: DtypeBackend | Literal["numpy"] = "numpy",
  1895. ) -> DataFrame | Iterator[DataFrame]:
  1896. """
  1897. Read SQL database table into a DataFrame.
  1898. Parameters
  1899. ----------
  1900. table_name : str
  1901. Name of SQL table in database.
  1902. coerce_float : bool, default True
  1903. Raises NotImplementedError
  1904. parse_dates : list or dict, default: None
  1905. - List of column names to parse as dates.
  1906. - Dict of ``{column_name: format string}`` where format string is
  1907. strftime compatible in case of parsing string times, or is one of
  1908. (D, s, ns, ms, us) in case of parsing integer timestamps.
  1909. - Dict of ``{column_name: arg}``, where the arg corresponds
  1910. to the keyword arguments of :func:`pandas.to_datetime`.
  1911. Especially useful with databases without native Datetime support,
  1912. such as SQLite.
  1913. columns : list, default: None
  1914. List of column names to select from SQL table.
  1915. schema : string, default None
  1916. Name of SQL schema in database to query (if database flavor
  1917. supports this). If specified, this overwrites the default
  1918. schema of the SQL database object.
  1919. chunksize : int, default None
  1920. Raises NotImplementedError
  1921. dtype_backend : {'numpy_nullable', 'pyarrow'}
  1922. Back-end data type applied to the resultant :class:`DataFrame`
  1923. (still experimental). If not specified, the default behavior
  1924. is to not use nullable data types. If specified, the behavior
  1925. is as follows:
  1926. * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
  1927. * ``"pyarrow"``: returns pyarrow-backed nullable
  1928. :class:`ArrowDtype` :class:`DataFrame`
  1929. .. versionadded:: 2.0
  1930. Returns
  1931. -------
  1932. DataFrame
  1933. See Also
  1934. --------
  1935. pandas.read_sql_table
  1936. SQLDatabase.read_query
  1937. """
  1938. if coerce_float is not True:
  1939. raise NotImplementedError(
  1940. "'coerce_float' is not implemented for ADBC drivers"
  1941. )
  1942. if chunksize:
  1943. raise NotImplementedError("'chunksize' is not implemented for ADBC drivers")
  1944. if columns:
  1945. if index_col:
  1946. index_select = maybe_make_list(index_col)
  1947. else:
  1948. index_select = []
  1949. to_select = index_select + columns
  1950. select_list = ", ".join(f'"{x}"' for x in to_select)
  1951. else:
  1952. select_list = "*"
  1953. if schema:
  1954. stmt = f"SELECT {select_list} FROM {schema}.{table_name}"
  1955. else:
  1956. stmt = f"SELECT {select_list} FROM {table_name}"
  1957. with self.execute(stmt) as cur:
  1958. pa_table = cur.fetch_arrow_table()
  1959. df = arrow_table_to_pandas(pa_table, dtype_backend=dtype_backend)
  1960. return _wrap_result_adbc(
  1961. df,
  1962. index_col=index_col,
  1963. parse_dates=parse_dates,
  1964. )
  1965. def read_query(
  1966. self,
  1967. sql: str,
  1968. index_col: str | list[str] | None = None,
  1969. coerce_float: bool = True,
  1970. parse_dates=None,
  1971. params=None,
  1972. chunksize: int | None = None,
  1973. dtype: DtypeArg | None = None,
  1974. dtype_backend: DtypeBackend | Literal["numpy"] = "numpy",
  1975. ) -> DataFrame | Iterator[DataFrame]:
  1976. """
  1977. Read SQL query into a DataFrame.
  1978. Parameters
  1979. ----------
  1980. sql : str
  1981. SQL query to be executed.
  1982. index_col : string, optional, default: None
  1983. Column name to use as index for the returned DataFrame object.
  1984. coerce_float : bool, default True
  1985. Raises NotImplementedError
  1986. params : list, tuple or dict, optional, default: None
  1987. Raises NotImplementedError
  1988. parse_dates : list or dict, default: None
  1989. - List of column names to parse as dates.
  1990. - Dict of ``{column_name: format string}`` where format string is
  1991. strftime compatible in case of parsing string times, or is one of
  1992. (D, s, ns, ms, us) in case of parsing integer timestamps.
  1993. - Dict of ``{column_name: arg dict}``, where the arg dict
  1994. corresponds to the keyword arguments of
  1995. :func:`pandas.to_datetime` Especially useful with databases
  1996. without native Datetime support, such as SQLite.
  1997. chunksize : int, default None
  1998. Raises NotImplementedError
  1999. dtype : Type name or dict of columns
  2000. Data type for data or columns. E.g. np.float64 or
  2001. {'a': np.float64, 'b': np.int32, 'c': 'Int64'}
  2002. Returns
  2003. -------
  2004. DataFrame
  2005. See Also
  2006. --------
  2007. read_sql_table : Read SQL database table into a DataFrame.
  2008. read_sql
  2009. """
  2010. if coerce_float is not True:
  2011. raise NotImplementedError(
  2012. "'coerce_float' is not implemented for ADBC drivers"
  2013. )
  2014. if params:
  2015. raise NotImplementedError("'params' is not implemented for ADBC drivers")
  2016. if chunksize:
  2017. raise NotImplementedError("'chunksize' is not implemented for ADBC drivers")
  2018. with self.execute(sql) as cur:
  2019. pa_table = cur.fetch_arrow_table()
  2020. df = arrow_table_to_pandas(pa_table, dtype_backend=dtype_backend)
  2021. return _wrap_result_adbc(
  2022. df,
  2023. index_col=index_col,
  2024. parse_dates=parse_dates,
  2025. dtype=dtype,
  2026. )
  2027. read_sql = read_query
  2028. def to_sql(
  2029. self,
  2030. frame,
  2031. name: str,
  2032. if_exists: Literal["fail", "replace", "append", "delete_rows"] = "fail",
  2033. index: bool = True,
  2034. index_label=None,
  2035. schema: str | None = None,
  2036. chunksize: int | None = None,
  2037. dtype: DtypeArg | None = None,
  2038. method: Literal["multi"] | Callable | None = None,
  2039. engine: str = "auto",
  2040. **engine_kwargs,
  2041. ) -> int | None:
  2042. """
  2043. Write records stored in a DataFrame to a SQL database.
  2044. Parameters
  2045. ----------
  2046. frame : DataFrame
  2047. name : string
  2048. Name of SQL table.
  2049. if_exists : {'fail', 'replace', 'append'}, default 'fail'
  2050. - fail: If table exists, do nothing.
  2051. - replace: If table exists, drop it, recreate it, and insert data.
  2052. - append: If table exists, insert data. Create if does not exist.
  2053. - delete_rows: If a table exists, delete all records and insert data.
  2054. index : boolean, default True
  2055. Write DataFrame index as a column.
  2056. index_label : string or sequence, default None
  2057. Raises NotImplementedError
  2058. schema : string, default None
  2059. Name of SQL schema in database to write to (if database flavor
  2060. supports this). If specified, this overwrites the default
  2061. schema of the SQLDatabase object.
  2062. chunksize : int, default None
  2063. Raises NotImplementedError
  2064. dtype : single type or dict of column name to SQL type, default None
  2065. Raises NotImplementedError
  2066. method : {None', 'multi', callable}, default None
  2067. Raises NotImplementedError
  2068. engine : {'auto', 'sqlalchemy'}, default 'auto'
  2069. Raises NotImplementedError if not set to 'auto'
  2070. """
  2071. pa = import_optional_dependency("pyarrow")
  2072. from adbc_driver_manager import Error
  2073. if index_label:
  2074. raise NotImplementedError(
  2075. "'index_label' is not implemented for ADBC drivers"
  2076. )
  2077. if chunksize:
  2078. raise NotImplementedError("'chunksize' is not implemented for ADBC drivers")
  2079. if dtype:
  2080. raise NotImplementedError("'dtype' is not implemented for ADBC drivers")
  2081. if method:
  2082. raise NotImplementedError("'method' is not implemented for ADBC drivers")
  2083. if engine != "auto":
  2084. raise NotImplementedError(
  2085. "engine != 'auto' not implemented for ADBC drivers"
  2086. )
  2087. if schema:
  2088. table_name = f"{schema}.{name}"
  2089. else:
  2090. table_name = name
  2091. # pandas if_exists="append" will still create the
  2092. # table if it does not exist; ADBC is more explicit with append/create
  2093. # as applicable modes, so the semantics get blurred across
  2094. # the libraries
  2095. mode = "create"
  2096. if self.has_table(name, schema):
  2097. if if_exists == "fail":
  2098. raise ValueError(f"Table '{table_name}' already exists.")
  2099. elif if_exists == "replace":
  2100. sql_statement = f"DROP TABLE {table_name}"
  2101. self.execute(sql_statement).close()
  2102. elif if_exists == "append":
  2103. mode = "append"
  2104. elif if_exists == "delete_rows":
  2105. mode = "append"
  2106. self.delete_rows(name, schema)
  2107. try:
  2108. tbl = pa.Table.from_pandas(frame, preserve_index=index)
  2109. except pa.ArrowNotImplementedError as exc:
  2110. raise ValueError("datatypes not supported") from exc
  2111. with self.con.cursor() as cur:
  2112. try:
  2113. total_inserted = cur.adbc_ingest(
  2114. table_name=name, data=tbl, mode=mode, db_schema_name=schema
  2115. )
  2116. except Error as exc:
  2117. raise DatabaseError(
  2118. f"Failed to insert records on table={name} with {mode=}"
  2119. ) from exc
  2120. self.con.commit()
  2121. return total_inserted
  2122. def has_table(self, name: str, schema: str | None = None) -> bool:
  2123. meta = self.con.adbc_get_objects(
  2124. db_schema_filter=schema, table_name_filter=name
  2125. ).read_all()
  2126. for catalog_schema in meta["catalog_db_schemas"].to_pylist():
  2127. if not catalog_schema:
  2128. continue
  2129. for schema_record in catalog_schema:
  2130. if not schema_record:
  2131. continue
  2132. for table_record in schema_record["db_schema_tables"]:
  2133. if table_record["table_name"] == name:
  2134. return True
  2135. return False
  2136. def delete_rows(self, name: str, schema: str | None = None) -> None:
  2137. table_name = f"{schema}.{name}" if schema else name
  2138. if self.has_table(name, schema):
  2139. self.execute(f"DELETE FROM {table_name}").close()
  2140. def _create_sql_schema(
  2141. self,
  2142. frame: DataFrame,
  2143. table_name: str,
  2144. keys: list[str] | None = None,
  2145. dtype: DtypeArg | None = None,
  2146. schema: str | None = None,
  2147. ) -> str:
  2148. raise NotImplementedError("not implemented for adbc")
  2149. # sqlite-specific sql strings and handler class
  2150. # dictionary used for readability purposes
  2151. _SQL_TYPES = {
  2152. "string": "TEXT",
  2153. "floating": "REAL",
  2154. "integer": "INTEGER",
  2155. "datetime": "TIMESTAMP",
  2156. "date": "DATE",
  2157. "time": "TIME",
  2158. "boolean": "INTEGER",
  2159. }
  2160. def _get_unicode_name(name: object) -> str:
  2161. try:
  2162. uname = str(name).encode("utf-8", "strict").decode("utf-8")
  2163. except UnicodeError as err:
  2164. raise ValueError(f"Cannot convert identifier to UTF-8: '{name}'") from err
  2165. return uname
  2166. def _get_valid_sqlite_name(name: object) -> str:
  2167. # See https://stackoverflow.com/questions/6514274/how-do-you-escape-strings\
  2168. # -for-sqlite-table-column-names-in-python
  2169. # Ensure the string can be encoded as UTF-8.
  2170. # Ensure the string does not include any NUL characters.
  2171. # Replace all " with "".
  2172. # Wrap the entire thing in double quotes.
  2173. uname = _get_unicode_name(name)
  2174. if not len(uname):
  2175. raise ValueError("Empty table or column name specified")
  2176. nul_index = uname.find("\x00")
  2177. if nul_index >= 0:
  2178. raise ValueError("SQLite identifier cannot contain NULs")
  2179. return '"' + uname.replace('"', '""') + '"'
  2180. class SQLiteTable(SQLTable):
  2181. """
  2182. Patch the SQLTable for fallback support.
  2183. Instead of a table variable just use the Create Table statement.
  2184. """
  2185. def __init__(self, *args, **kwargs) -> None:
  2186. super().__init__(*args, **kwargs)
  2187. self._register_date_adapters()
  2188. def _register_date_adapters(self) -> None:
  2189. # GH 8341
  2190. # register an adapter callable for datetime.time object
  2191. import sqlite3
  2192. # this will transform time(12,34,56,789) into '12:34:56.000789'
  2193. # (this is what sqlalchemy does)
  2194. def _adapt_time(t) -> str:
  2195. # This is faster than strftime
  2196. return f"{t.hour:02d}:{t.minute:02d}:{t.second:02d}.{t.microsecond:06d}"
  2197. # Also register adapters for date/datetime and co
  2198. # xref https://docs.python.org/3.12/library/sqlite3.html#adapter-and-converter-recipes
  2199. # Python 3.12+ doesn't auto-register adapters for us anymore
  2200. adapt_date_iso = lambda val: val.isoformat()
  2201. adapt_datetime_iso = lambda val: val.isoformat(" ")
  2202. sqlite3.register_adapter(time, _adapt_time)
  2203. sqlite3.register_adapter(date, adapt_date_iso)
  2204. sqlite3.register_adapter(datetime, adapt_datetime_iso)
  2205. convert_date = lambda val: date.fromisoformat(val.decode())
  2206. convert_timestamp = lambda val: datetime.fromisoformat(val.decode())
  2207. sqlite3.register_converter("date", convert_date)
  2208. sqlite3.register_converter("timestamp", convert_timestamp)
  2209. def sql_schema(self) -> str:
  2210. return str(";\n".join(self.table))
  2211. def _execute_create(self) -> None:
  2212. with self.pd_sql.run_transaction() as cur:
  2213. for stmt in self.table:
  2214. cur.execute(stmt)
  2215. def insert_statement(self, *, num_rows: int) -> str:
  2216. names = list(map(str, self.frame.columns))
  2217. wld = "?" # wildcard char
  2218. escape = _get_valid_sqlite_name
  2219. if self.index is not None:
  2220. for idx in self.index[::-1]:
  2221. names.insert(0, idx)
  2222. bracketed_names = [escape(column) for column in names]
  2223. col_names = ",".join(bracketed_names)
  2224. row_wildcards = ",".join([wld] * len(names))
  2225. wildcards = ",".join([f"({row_wildcards})" for _ in range(num_rows)])
  2226. insert_statement = (
  2227. f"INSERT INTO {escape(self.name)} ({col_names}) VALUES {wildcards}"
  2228. )
  2229. return insert_statement
  2230. def _execute_insert(self, conn, keys, data_iter) -> int:
  2231. from sqlite3 import Error
  2232. data_list = list(data_iter)
  2233. try:
  2234. conn.executemany(self.insert_statement(num_rows=1), data_list)
  2235. except Error as exc:
  2236. raise DatabaseError("Execution failed") from exc
  2237. return conn.rowcount
  2238. def _execute_insert_multi(self, conn, keys, data_iter) -> int:
  2239. data_list = list(data_iter)
  2240. flattened_data = [x for row in data_list for x in row]
  2241. conn.execute(self.insert_statement(num_rows=len(data_list)), flattened_data)
  2242. return conn.rowcount
  2243. def _create_table_setup(self):
  2244. """
  2245. Return a list of SQL statements that creates a table reflecting the
  2246. structure of a DataFrame. The first entry will be a CREATE TABLE
  2247. statement while the rest will be CREATE INDEX statements.
  2248. """
  2249. column_names_and_types = self._get_column_names_and_types(self._sql_type_name)
  2250. escape = _get_valid_sqlite_name
  2251. create_tbl_stmts = [
  2252. escape(cname) + " " + ctype for cname, ctype, _ in column_names_and_types
  2253. ]
  2254. if self.keys is not None and len(self.keys):
  2255. if not is_list_like(self.keys):
  2256. keys = [self.keys]
  2257. else:
  2258. keys = self.keys
  2259. cnames_br = ", ".join([escape(c) for c in keys])
  2260. create_tbl_stmts.append(
  2261. f"CONSTRAINT {self.name}_pk PRIMARY KEY ({cnames_br})"
  2262. )
  2263. if self.schema:
  2264. schema_name = self.schema + "."
  2265. else:
  2266. schema_name = ""
  2267. create_stmts = [
  2268. "CREATE TABLE "
  2269. + schema_name
  2270. + escape(self.name)
  2271. + " (\n"
  2272. + ",\n ".join(create_tbl_stmts)
  2273. + "\n)"
  2274. ]
  2275. ix_cols = [cname for cname, _, is_index in column_names_and_types if is_index]
  2276. if ix_cols:
  2277. cnames = "_".join(ix_cols)
  2278. cnames_br = ",".join([escape(c) for c in ix_cols])
  2279. create_stmts.append(
  2280. "CREATE INDEX "
  2281. + escape("ix_" + self.name + "_" + cnames)
  2282. + "ON "
  2283. + escape(self.name)
  2284. + " ("
  2285. + cnames_br
  2286. + ")"
  2287. )
  2288. return create_stmts
  2289. def _sql_type_name(self, col):
  2290. dtype: DtypeArg = self.dtype or {}
  2291. if is_dict_like(dtype):
  2292. dtype = cast(dict, dtype)
  2293. if col.name in dtype:
  2294. return dtype[col.name]
  2295. # Infer type of column, while ignoring missing values.
  2296. # Needed for inserting typed data containing NULLs, GH 8778.
  2297. col_type = lib.infer_dtype(col, skipna=True)
  2298. if col_type == "timedelta64":
  2299. warnings.warn(
  2300. "the 'timedelta' type is not supported, and will be "
  2301. "written as integer values (ns frequency) to the database.",
  2302. UserWarning,
  2303. stacklevel=find_stack_level(),
  2304. )
  2305. col_type = "integer"
  2306. elif col_type == "datetime64":
  2307. col_type = "datetime"
  2308. elif col_type == "empty":
  2309. col_type = "string"
  2310. elif col_type == "complex":
  2311. raise ValueError("Complex datatypes not supported")
  2312. if col_type not in _SQL_TYPES:
  2313. col_type = "string"
  2314. return _SQL_TYPES[col_type]
  2315. class SQLiteDatabase(PandasSQL):
  2316. """
  2317. Version of SQLDatabase to support SQLite connections (fallback without
  2318. SQLAlchemy). This should only be used internally.
  2319. Parameters
  2320. ----------
  2321. con : sqlite connection object
  2322. """
  2323. def __init__(self, con) -> None:
  2324. self.con = con
  2325. @contextmanager
  2326. def run_transaction(self):
  2327. cur = self.con.cursor()
  2328. try:
  2329. yield cur
  2330. self.con.commit()
  2331. except Exception:
  2332. self.con.rollback()
  2333. raise
  2334. finally:
  2335. cur.close()
  2336. def execute(self, sql: str | Select | TextClause, params=None):
  2337. from sqlite3 import Error
  2338. if not isinstance(sql, str):
  2339. raise TypeError("Query must be a string unless using sqlalchemy.")
  2340. args = [] if params is None else [params]
  2341. cur = self.con.cursor()
  2342. try:
  2343. cur.execute(sql, *args)
  2344. return cur
  2345. except Error as exc:
  2346. try:
  2347. self.con.rollback()
  2348. except Error as inner_exc: # pragma: no cover
  2349. ex = DatabaseError(
  2350. f"Execution failed on sql: {sql}\n{exc}\nunable to rollback"
  2351. )
  2352. raise ex from inner_exc
  2353. ex = DatabaseError(f"Execution failed on sql '{sql}': {exc}")
  2354. raise ex from exc
  2355. @staticmethod
  2356. def _query_iterator(
  2357. cursor,
  2358. chunksize: int,
  2359. columns,
  2360. index_col=None,
  2361. coerce_float: bool = True,
  2362. parse_dates=None,
  2363. dtype: DtypeArg | None = None,
  2364. dtype_backend: DtypeBackend | Literal["numpy"] = "numpy",
  2365. ) -> Generator[DataFrame]:
  2366. """Return generator through chunked result set"""
  2367. has_read_data = False
  2368. while True:
  2369. data = cursor.fetchmany(chunksize)
  2370. if type(data) == tuple:
  2371. data = list(data)
  2372. if not data:
  2373. cursor.close()
  2374. if not has_read_data:
  2375. result = DataFrame.from_records(
  2376. [], columns=columns, coerce_float=coerce_float
  2377. )
  2378. if dtype:
  2379. result = result.astype(dtype)
  2380. yield result
  2381. break
  2382. has_read_data = True
  2383. yield _wrap_result(
  2384. data,
  2385. columns,
  2386. index_col=index_col,
  2387. coerce_float=coerce_float,
  2388. parse_dates=parse_dates,
  2389. dtype=dtype,
  2390. dtype_backend=dtype_backend,
  2391. )
  2392. def read_query(
  2393. self,
  2394. sql,
  2395. index_col=None,
  2396. coerce_float: bool = True,
  2397. parse_dates=None,
  2398. params=None,
  2399. chunksize: int | None = None,
  2400. dtype: DtypeArg | None = None,
  2401. dtype_backend: DtypeBackend | Literal["numpy"] = "numpy",
  2402. ) -> DataFrame | Iterator[DataFrame]:
  2403. cursor = self.execute(sql, params)
  2404. columns = [col_desc[0] for col_desc in cursor.description]
  2405. if chunksize is not None:
  2406. return self._query_iterator(
  2407. cursor,
  2408. chunksize,
  2409. columns,
  2410. index_col=index_col,
  2411. coerce_float=coerce_float,
  2412. parse_dates=parse_dates,
  2413. dtype=dtype,
  2414. dtype_backend=dtype_backend,
  2415. )
  2416. else:
  2417. data = self._fetchall_as_list(cursor)
  2418. cursor.close()
  2419. frame = _wrap_result(
  2420. data,
  2421. columns,
  2422. index_col=index_col,
  2423. coerce_float=coerce_float,
  2424. parse_dates=parse_dates,
  2425. dtype=dtype,
  2426. dtype_backend=dtype_backend,
  2427. )
  2428. return frame
  2429. def _fetchall_as_list(self, cur):
  2430. result = cur.fetchall()
  2431. if not isinstance(result, list):
  2432. result = list(result)
  2433. return result
  2434. def to_sql(
  2435. self,
  2436. frame,
  2437. name: str,
  2438. if_exists: str = "fail",
  2439. index: bool = True,
  2440. index_label=None,
  2441. schema=None,
  2442. chunksize: int | None = None,
  2443. dtype: DtypeArg | None = None,
  2444. method: Literal["multi"] | Callable | None = None,
  2445. engine: str = "auto",
  2446. **engine_kwargs,
  2447. ) -> int | None:
  2448. """
  2449. Write records stored in a DataFrame to a SQL database.
  2450. Parameters
  2451. ----------
  2452. frame: DataFrame
  2453. name: string
  2454. Name of SQL table.
  2455. if_exists: {'fail', 'replace', 'append', 'delete_rows'}, default 'fail'
  2456. fail: If table exists, do nothing.
  2457. replace: If table exists, drop it, recreate it, and insert data.
  2458. append: If table exists, insert data. Create if it does not exist.
  2459. delete_rows: If a table exists, delete all records and insert data.
  2460. index : bool, default True
  2461. Write DataFrame index as a column
  2462. index_label : string or sequence, default None
  2463. Column label for index column(s). If None is given (default) and
  2464. `index` is True, then the index names are used.
  2465. A sequence should be given if the DataFrame uses MultiIndex.
  2466. schema : string, default None
  2467. Ignored parameter included for compatibility with SQLAlchemy
  2468. version of ``to_sql``.
  2469. chunksize : int, default None
  2470. If not None, then rows will be written in batches of this
  2471. size at a time. If None, all rows will be written at once.
  2472. dtype : single type or dict of column name to SQL type, default None
  2473. Optional specifying the datatype for columns. The SQL type should
  2474. be a string. If all columns are of the same type, one single value
  2475. can be used.
  2476. method : {None, 'multi', callable}, default None
  2477. Controls the SQL insertion clause used:
  2478. * None : Uses standard SQL ``INSERT`` clause (one per row).
  2479. * 'multi': Pass multiple values in a single ``INSERT`` clause.
  2480. * callable with signature ``(pd_table, conn, keys, data_iter)``.
  2481. Details and a sample callable implementation can be found in the
  2482. section :ref:`insert method <io.sql.method>`.
  2483. """
  2484. if dtype:
  2485. if not is_dict_like(dtype):
  2486. # error: Value expression in dictionary comprehension has incompatible
  2487. # type "Union[ExtensionDtype, str, dtype[Any], Type[object],
  2488. # Dict[Hashable, Union[ExtensionDtype, Union[str, dtype[Any]],
  2489. # Type[str], Type[float], Type[int], Type[complex], Type[bool],
  2490. # Type[object]]]]"; expected type "Union[ExtensionDtype, str,
  2491. # dtype[Any], Type[object]]"
  2492. dtype = dict.fromkeys(frame, dtype) # type: ignore[arg-type]
  2493. else:
  2494. dtype = cast(dict, dtype)
  2495. for col, my_type in dtype.items():
  2496. if not isinstance(my_type, str):
  2497. raise ValueError(f"{col} ({my_type}) not a string")
  2498. table = SQLiteTable(
  2499. name,
  2500. self,
  2501. frame=frame,
  2502. index=index,
  2503. if_exists=if_exists,
  2504. index_label=index_label,
  2505. dtype=dtype,
  2506. )
  2507. table.create()
  2508. return table.insert(chunksize, method)
  2509. def has_table(self, name: str, schema: str | None = None) -> bool:
  2510. wld = "?"
  2511. query = f"""
  2512. SELECT
  2513. name
  2514. FROM
  2515. sqlite_master
  2516. WHERE
  2517. type IN ('table', 'view')
  2518. AND name={wld};
  2519. """
  2520. return len(self.execute(query, [name]).fetchall()) > 0
  2521. def get_table(self, table_name: str, schema: str | None = None) -> None:
  2522. return None # not supported in fallback mode
  2523. def drop_table(self, name: str, schema: str | None = None) -> None:
  2524. drop_sql = f"DROP TABLE {_get_valid_sqlite_name(name)}"
  2525. self.execute(drop_sql).close()
  2526. def delete_rows(self, name: str, schema: str | None = None) -> None:
  2527. delete_sql = f"DELETE FROM {_get_valid_sqlite_name(name)}"
  2528. if self.has_table(name, schema):
  2529. self.execute(delete_sql).close()
  2530. def _create_sql_schema(
  2531. self,
  2532. frame,
  2533. table_name: str,
  2534. keys=None,
  2535. dtype: DtypeArg | None = None,
  2536. schema: str | None = None,
  2537. ) -> str:
  2538. table = SQLiteTable(
  2539. table_name,
  2540. self,
  2541. frame=frame,
  2542. index=False,
  2543. keys=keys,
  2544. dtype=dtype,
  2545. schema=schema,
  2546. )
  2547. return str(table.sql_schema())
  2548. def get_schema(
  2549. frame,
  2550. name: str,
  2551. keys=None,
  2552. con=None,
  2553. dtype: DtypeArg | None = None,
  2554. schema: str | None = None,
  2555. ) -> str:
  2556. """
  2557. Get the SQL db table schema for the given frame.
  2558. Parameters
  2559. ----------
  2560. frame : DataFrame
  2561. name : str
  2562. name of SQL table
  2563. keys : string or sequence, default: None
  2564. columns to use a primary key
  2565. con: ADBC Connection, SQLAlchemy connectable, sqlite3 connection, default: None
  2566. ADBC provides high performance I/O with native type support, where available.
  2567. Using SQLAlchemy makes it possible to use any DB supported by that
  2568. library
  2569. If a DBAPI2 object, only sqlite3 is supported.
  2570. dtype : dict of column name to SQL type, default None
  2571. Optional specifying the datatype for columns. The SQL type should
  2572. be a SQLAlchemy type, or a string for sqlite3 fallback connection.
  2573. schema: str, default: None
  2574. Optional specifying the schema to be used in creating the table.
  2575. """
  2576. with pandasSQL_builder(con=con) as pandas_sql:
  2577. return pandas_sql._create_sql_schema(
  2578. frame, name, keys=keys, dtype=dtype, schema=schema
  2579. )