| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
70370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761
27712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648 |
- from collections.abc import Iterator
- from functools import partial
- from io import (
- BytesIO,
- StringIO,
- )
- import os
- from pathlib import Path
- import re
- import threading
- from urllib.error import URLError
- import numpy as np
- import pytest
- from pandas.compat import is_platform_windows
- import pandas.util._test_decorators as td
- import pandas as pd
- from pandas import (
- NA,
- DataFrame,
- MultiIndex,
- Series,
- Timestamp,
- date_range,
- read_csv,
- read_html,
- to_datetime,
- )
- import pandas._testing as tm
- from pandas.io.common import file_path_to_url
@pytest.fixture(
    params=[
        "chinese_utf-16.html",
        "chinese_utf-32.html",
        "chinese_utf-8.html",
        "letz_latin1.html",
    ]
)
def html_encoding_file(request, datapath):
    """
    Yield, one per parametrization, the path of an HTML encoding sample.

    Each parameter is a file name that is resolved through ``datapath``
    under ``io/data/html_encoding``.
    """
    sample_name = request.param
    return datapath("io", "data", "html_encoding", sample_name)
def assert_framelist_equal(list1, list2, *args, **kwargs):
    """
    Assert that two lists contain pairwise-equal, non-empty DataFrames.

    Parameters
    ----------
    list1, list2 : list
        Lists to compare; they must have the same length and contain
        only DataFrame objects.
    *args, **kwargs
        Forwarded unchanged to ``tm.assert_frame_equal``.

    Raises
    ------
    AssertionError
        If the lengths differ, an element is not a DataFrame, a pair of
        frames is not equal, or a (matching) pair of frames is empty.
    """
    size1, size2 = len(list1), len(list2)
    assert size1 == size2, (
        "lists are not of equal size "
        f"len(list1) == {size1}, "
        f"len(list2) == {size2}"
    )

    pairs = list(zip(list1, list2))
    only_frames = all(
        isinstance(left, DataFrame) and isinstance(right, DataFrame)
        for left, right in pairs
    )
    assert only_frames, "not all list elements are DataFrames"

    for left, right in pairs:
        tm.assert_frame_equal(left, right, *args, **kwargs)
        # The frames are equal at this point, so checking one is enough.
        assert not left.empty, "frames are both empty"
def test_bs4_version_fails(monkeypatch, datapath):
    """read_html with flavor="bs4" raises ImportError when bs4 reports version 4.2."""
    bs4_module = pytest.importorskip("bs4")
    pytest.importorskip("html5lib")
    # Make the installed bs4 report an old version string.
    monkeypatch.setattr(bs4_module, "__version__", "4.2")
    spam_path = datapath("io", "data", "html", "spam.html")
    with pytest.raises(ImportError, match="Pandas requires version"):
        read_html(spam_path, flavor="bs4")
def test_invalid_flavor():
    """An unknown ``flavor`` string is rejected with a ValueError."""
    bad_flavor = "invalid flavor"
    expected_msg = r"\{" + bad_flavor + r"\} is not a valid set of flavors"
    buffer = StringIO("google.com")
    with pytest.raises(ValueError, match=expected_msg):
        read_html(buffer, match="google", flavor=bad_flavor)
def test_same_ordering(datapath):
    """The lxml and bs4 flavors return equal frame lists for valid markup."""
    for required in ("bs4", "lxml", "html5lib"):
        pytest.importorskip(required)
    markup_path = datapath("io", "data", "html", "valid_markup.html")
    from_lxml = read_html(markup_path, index_col=0, flavor=["lxml"])
    from_bs4 = read_html(markup_path, index_col=0, flavor=["bs4"])
    assert_framelist_equal(from_lxml, from_bs4)
@pytest.fixture(
    params=[
        pytest.param("bs4", marks=[td.skip_if_no("bs4"), td.skip_if_no("html5lib")]),
        pytest.param("lxml", marks=td.skip_if_no("lxml")),
    ],
)
def flavor_read_html(request):
    """
    Provide ``read_html`` with ``flavor`` pre-bound to the current parameter.

    Parametrized over "bs4" and "lxml"; each parameter carries
    ``td.skip_if_no`` marks for the packages it names.
    """
    parser_flavor = request.param
    return partial(read_html, flavor=parser_flavor)
- class TestReadHtml:
def test_literal_html_deprecation(self, flavor_read_html):
    """Passing a literal HTML string emits the expected FutureWarning."""
    # GH 53785
    literal_html = """<table>
<thead>
<tr>
<th>A</th>
<th>B</th>
</tr>
</thead>
<tbody>
<tr>
<td>1</td>
<td>2</td>
</tr>
</tbody>
<tbody>
<tr>
<td>3</td>
<td>4</td>
</tr>
</tbody>
</table>"""
    warning_msg = (
        "Passing literal html to 'read_html' is deprecated and "
        "will be removed in a future version. To read from a "
        "literal string, wrap it in a 'StringIO' object."
    )
    with tm.assert_produces_warning(FutureWarning, match=warning_msg):
        flavor_read_html(literal_html)
@pytest.fixture
def spam_data(self, datapath):
    """Path of ``io/data/html/spam.html`` as resolved by ``datapath``."""
    spam_path = datapath("io", "data", "html", "spam.html")
    return spam_path
@pytest.fixture
def banklist_data(self, datapath):
    """Path of ``io/data/html/banklist.html`` as resolved by ``datapath``."""
    banklist_path = datapath("io", "data", "html", "banklist.html")
    return banklist_path
def test_to_html_compat(self, flavor_read_html):
    """A frame written with ``to_html`` is read back unchanged."""
    raw = DataFrame(
        np.random.default_rng(2).random((4, 3)),
        columns=pd.Index(list("abc")),
    )
    # Round to three decimals via string formatting, then back to float.
    # pylint: disable-next=consider-using-f-string
    df = raw.map("{:.3f}".format).astype(float)
    html = df.to_html()
    tables = flavor_read_html(
        StringIO(html), attrs={"class": "dataframe"}, index_col=0
    )
    tm.assert_frame_equal(tables[0], df)
def test_dtype_backend(self, string_storage, dtype_backend, flavor_read_html):
    """``dtype_backend`` controls the dtypes of frames parsed from HTML."""
    # GH#50286
    source = DataFrame(
        {
            "a": Series([1, np.nan, 3], dtype="Int64"),
            "b": Series([1, 2, 3], dtype="Int64"),
            "c": Series([1.5, np.nan, 2.5], dtype="Float64"),
            "d": Series([1.5, 2.0, 2.5], dtype="Float64"),
            "e": [True, False, None],
            "f": [True, False, True],
            "g": ["a", "b", "c"],
            "h": ["a", "b", None],
        }
    )
    html = source.to_html(index=False)

    with pd.option_context("mode.string_storage", string_storage):
        parsed = flavor_read_html(StringIO(html), dtype_backend=dtype_backend)
        result = parsed[0]

    use_pyarrow = dtype_backend == "pyarrow"
    if use_pyarrow:
        pa = pytest.importorskip("pyarrow")
        string_dtype = pd.ArrowDtype(pa.string())
    else:
        string_dtype = pd.StringDtype(string_storage)

    expected = DataFrame(
        {
            "a": Series([1, np.nan, 3], dtype="Int64"),
            "b": Series([1, 2, 3], dtype="Int64"),
            "c": Series([1.5, np.nan, 2.5], dtype="Float64"),
            "d": Series([1.5, 2.0, 2.5], dtype="Float64"),
            "e": Series([True, False, NA], dtype="boolean"),
            "f": Series([True, False, True], dtype="boolean"),
            "g": Series(["a", "b", "c"], dtype=string_dtype),
            "h": Series(["a", "b", None], dtype=string_dtype),
        }
    )

    if use_pyarrow:
        import pyarrow as pa

        from pandas.arrays import ArrowExtensionArray

        # Rebuild every expected column as an Arrow-backed array.
        arrow_columns = {}
        for col in expected.columns:
            arrow_columns[col] = ArrowExtensionArray(
                pa.array(expected[col], from_pandas=True)
            )
        expected = DataFrame(arrow_columns)

    # the storage of the str columns' Index is also affected by the
    # string_storage setting -> ignore that for checking the result
    tm.assert_frame_equal(result, expected, check_column_type=False)
@pytest.mark.network
@pytest.mark.single_cpu
def test_banklist_url(self, httpserver, banklist_data, flavor_read_html):
    """Two different ``match`` strings select the same table over HTTP."""
    with open(banklist_data, encoding="utf-8") as handle:
        page = handle.read()
        httpserver.serve_content(content=page)
        # lxml cannot find attrs leave out for now
        # (i.e. attrs={"class": "dataTable"} is deliberately not passed)
        by_florida = flavor_read_html(
            httpserver.url,
            match="First Federal Bank of Florida",
        )
        # lxml cannot find attrs leave out for now
        by_metcalf = flavor_read_html(
            httpserver.url,
            match="Metcalf Bank",
        )

    assert_framelist_equal(by_florida, by_metcalf)
@pytest.mark.network
@pytest.mark.single_cpu
def test_spam_url(self, httpserver, spam_data, flavor_read_html):
    """Two different ``match`` patterns select the same spam table over HTTP."""
    with open(spam_data, encoding="utf-8") as handle:
        page = handle.read()
        httpserver.serve_content(content=page)
        by_water = flavor_read_html(httpserver.url, match=".*Water.*")
        by_unit = flavor_read_html(httpserver.url, match="Unit")

    assert_framelist_equal(by_water, by_unit)
@pytest.mark.slow
def test_banklist(self, banklist_data, flavor_read_html):
    """A regex and a plain ``match`` select the same ``id="table"`` table."""
    table_attrs = {"id": "table"}
    by_florida = flavor_read_html(
        banklist_data, match=".*Florida.*", attrs=table_attrs
    )
    by_metcalf = flavor_read_html(
        banklist_data, match="Metcalf Bank", attrs={"id": "table"}
    )
    assert_framelist_equal(by_florida, by_metcalf)
def test_spam(self, spam_data, flavor_read_html):
    """Both ``match`` patterns find the same table with the expected content."""
    by_water = flavor_read_html(spam_data, match=".*Water.*")
    by_unit = flavor_read_html(spam_data, match="Unit")
    assert_framelist_equal(by_water, by_unit)

    first = by_water[0]
    assert first.iloc[0, 0] == "Proximates"
    assert first.columns[0] == "Nutrient"
def test_spam_no_match(self, spam_data, flavor_read_html):
    """Without ``match``, every returned element is a DataFrame."""
    tables = flavor_read_html(spam_data)
    for table in tables:
        assert isinstance(table, DataFrame)
def test_banklist_no_match(self, banklist_data, flavor_read_html):
    """With only ``attrs`` given, every returned element is a DataFrame."""
    tables = flavor_read_html(banklist_data, attrs={"id": "table"})
    for table in tables:
        assert isinstance(table, DataFrame)
def test_spam_header(self, spam_data, flavor_read_html):
    """``header=2`` makes "Proximates" the first column label."""
    tables = flavor_read_html(spam_data, match=".*Water.*", header=2)
    table = tables[0]
    assert table.columns[0] == "Proximates"
    assert not table.empty
def test_skiprows_int(self, spam_data, flavor_read_html):
    """An integer ``skiprows`` gives the same result for both match patterns."""
    by_water, by_unit = (
        flavor_read_html(spam_data, match=pattern, skiprows=1)
        for pattern in (".*Water.*", "Unit")
    )
    assert_framelist_equal(by_water, by_unit)
def test_skiprows_range(self, spam_data, flavor_read_html):
    """A ``range`` is accepted as ``skiprows``."""
    by_water, by_unit = (
        flavor_read_html(spam_data, match=pattern, skiprows=range(2))
        for pattern in (".*Water.*", "Unit")
    )
    assert_framelist_equal(by_water, by_unit)
def test_skiprows_list(self, spam_data, flavor_read_html):
    """A list ``skiprows`` gives the same result regardless of element order."""
    ascending = [1, 2]
    descending = [2, 1]
    by_water = flavor_read_html(spam_data, match=".*Water.*", skiprows=ascending)
    by_unit = flavor_read_html(spam_data, match="Unit", skiprows=descending)
    assert_framelist_equal(by_water, by_unit)
def test_skiprows_set(self, spam_data, flavor_read_html):
    """A set is accepted as ``skiprows``."""
    first_rows = {1, 2}
    same_rows = {2, 1}
    by_water = flavor_read_html(spam_data, match=".*Water.*", skiprows=first_rows)
    by_unit = flavor_read_html(spam_data, match="Unit", skiprows=same_rows)
    assert_framelist_equal(by_water, by_unit)
def test_skiprows_slice(self, spam_data, flavor_read_html):
    """
    A ``slice`` with only a stop value is accepted as ``skiprows``.

    The previous body passed the integer ``1`` and was therefore an exact
    duplicate of ``test_skiprows_int``; it never exercised a slice.
    ``slice(1)`` keeps the same intent (skip the first row) while going
    through the slice handling with default start and step.
    """
    by_water = flavor_read_html(spam_data, match=".*Water.*", skiprows=slice(1))
    by_unit = flavor_read_html(spam_data, match="Unit", skiprows=slice(1))
    assert_framelist_equal(by_water, by_unit)
def test_skiprows_slice_short(self, spam_data, flavor_read_html):
    """``slice(2)`` is accepted as ``skiprows``."""
    by_water, by_unit = (
        flavor_read_html(spam_data, match=pattern, skiprows=slice(2))
        for pattern in (".*Water.*", "Unit")
    )
    assert_framelist_equal(by_water, by_unit)
def test_skiprows_slice_long(self, spam_data, flavor_read_html):
    """A forward slice and a reversed slice as ``skiprows`` give equal results."""
    forward = slice(2, 5)
    backward = slice(4, 1, -1)
    by_water = flavor_read_html(spam_data, match=".*Water.*", skiprows=forward)
    by_unit = flavor_read_html(spam_data, match="Unit", skiprows=backward)
    assert_framelist_equal(by_water, by_unit)
def test_skiprows_ndarray(self, spam_data, flavor_read_html):
    """A NumPy array is accepted as ``skiprows``."""
    by_water, by_unit = (
        flavor_read_html(spam_data, match=pattern, skiprows=np.arange(2))
        for pattern in (".*Water.*", "Unit")
    )
    assert_framelist_equal(by_water, by_unit)
def test_skiprows_invalid(self, spam_data, flavor_read_html):
    """A string ``skiprows`` raises TypeError."""
    expected_msg = "is not a valid type for skipping rows"
    with pytest.raises(TypeError, match=expected_msg):
        flavor_read_html(spam_data, match=".*Water.*", skiprows="asdf")
def test_index(self, spam_data, flavor_read_html):
    """``index_col=0`` gives the same result for both match patterns."""
    by_water, by_unit = (
        flavor_read_html(spam_data, match=pattern, index_col=0)
        for pattern in (".*Water.*", "Unit")
    )
    assert_framelist_equal(by_water, by_unit)
def test_header_and_index_no_types(self, spam_data, flavor_read_html):
    """``header=1`` with ``index_col=0`` is consistent across match patterns."""
    by_water, by_unit = (
        flavor_read_html(spam_data, match=pattern, header=1, index_col=0)
        for pattern in (".*Water.*", "Unit")
    )
    assert_framelist_equal(by_water, by_unit)
def test_header_and_index_with_types(self, spam_data, flavor_read_html):
    """``header=1`` with ``index_col=0`` is consistent across match patterns."""
    shared = {"header": 1, "index_col": 0}
    by_water = flavor_read_html(spam_data, match=".*Water.*", **shared)
    by_unit = flavor_read_html(spam_data, match="Unit", **shared)
    assert_framelist_equal(by_water, by_unit)
def test_infer_types(self, spam_data, flavor_read_html):
    """Reading with ``index_col=0`` is consistent across match patterns."""
    # 10892 infer_types removed
    by_water, by_unit = (
        flavor_read_html(spam_data, match=pattern, index_col=0)
        for pattern in (".*Water.*", "Unit")
    )
    assert_framelist_equal(by_water, by_unit)
def test_string_io(self, spam_data, flavor_read_html):
    """``StringIO`` buffers holding the page are accepted as input."""
    buffers = []
    # Each read_html call gets its own freshly filled buffer.
    for _ in range(2):
        with open(spam_data, encoding="UTF-8") as handle:
            buffers.append(StringIO(handle.read()))
    water_buffer, unit_buffer = buffers

    by_water = flavor_read_html(water_buffer, match=".*Water.*")
    by_unit = flavor_read_html(unit_buffer, match="Unit")
    assert_framelist_equal(by_water, by_unit)
def test_string(self, spam_data, flavor_read_html):
    """The page text, read once and wrapped in ``StringIO`` twice, parses equally."""
    with open(spam_data, encoding="UTF-8") as handle:
        page = handle.read()
    by_water, by_unit = (
        flavor_read_html(StringIO(page), match=pattern)
        for pattern in (".*Water.*", "Unit")
    )
    assert_framelist_equal(by_water, by_unit)
def test_file_like(self, spam_data, flavor_read_html):
    """Open text file handles are accepted as input."""
    with open(spam_data, encoding="UTF-8") as water_handle:
        by_water = flavor_read_html(water_handle, match=".*Water.*")
    with open(spam_data, encoding="UTF-8") as unit_handle:
        by_unit = flavor_read_html(unit_handle, match="Unit")
    assert_framelist_equal(by_water, by_unit)
@pytest.mark.network
@pytest.mark.single_cpu
def test_bad_url_protocol(self, httpserver, flavor_read_html):
    """A ``git://`` URL raises URLError with the unknown-url-type message."""
    error_text = "urlopen error unknown url type: git"
    httpserver.serve_content(error_text, code=404)
    with pytest.raises(URLError, match=error_text):
        flavor_read_html("git://github.com", match=".*Water.*")
@pytest.mark.slow
@pytest.mark.network
@pytest.mark.single_cpu
def test_invalid_url(self, httpserver, flavor_read_html):
    """A 404 response from the server surfaces as URLError or ValueError."""
    httpserver.serve_content("Name or service not known", code=404)
    accepted_errors = (URLError, ValueError)
    with pytest.raises(accepted_errors, match="HTTP Error 404: NOT FOUND"):
        flavor_read_html(httpserver.url, match=".*Water.*")
@pytest.mark.slow
def test_file_url(self, banklist_data, flavor_read_html):
    """A URL built by ``file_path_to_url`` is readable and yields DataFrames."""
    file_url = file_path_to_url(os.path.abspath(banklist_data))
    tables = flavor_read_html(file_url, match="First", attrs={"id": "table"})
    assert isinstance(tables, list)
    for table in tables:
        assert isinstance(table, DataFrame)
@pytest.mark.slow
def test_invalid_table_attrs(self, banklist_data, flavor_read_html):
    """``attrs`` that match no table raise ValueError("No tables found")."""
    missing_attrs = {"id": "tasdfable"}
    with pytest.raises(ValueError, match="No tables found"):
        flavor_read_html(
            banklist_data,
            match="First Federal Bank of Florida",
            attrs=missing_attrs,
        )
@pytest.mark.slow
def test_multiindex_header(self, banklist_data, flavor_read_html):
    """``header=[0, 1]`` produces MultiIndex columns."""
    tables = flavor_read_html(
        banklist_data, match="Metcalf", attrs={"id": "table"}, header=[0, 1]
    )
    table = tables[0]
    assert isinstance(table.columns, MultiIndex)
@pytest.mark.slow
def test_multiindex_index(self, banklist_data, flavor_read_html):
    """``index_col=[0, 1]`` produces a MultiIndex row index."""
    tables = flavor_read_html(
        banklist_data, match="Metcalf", attrs={"id": "table"}, index_col=[0, 1]
    )
    table = tables[0]
    assert isinstance(table.index, MultiIndex)
@pytest.mark.slow
def test_multiindex_header_index(self, banklist_data, flavor_read_html):
    """List-valued ``header`` and ``index_col`` give MultiIndex on both axes."""
    tables = flavor_read_html(
        banklist_data,
        match="Metcalf",
        attrs={"id": "table"},
        header=[0, 1],
        index_col=[0, 1],
    )
    table = tables[0]
    for axis_labels in (table.columns, table.index):
        assert isinstance(axis_labels, MultiIndex)
@pytest.mark.slow
def test_multiindex_header_skiprows_tuples(self, banklist_data, flavor_read_html):
    """``header=[0, 1]`` with ``skiprows=1`` produces MultiIndex columns."""
    read_kwargs = {
        "match": "Metcalf",
        "attrs": {"id": "table"},
        "header": [0, 1],
        "skiprows": 1,
    }
    table = flavor_read_html(banklist_data, **read_kwargs)[0]
    assert isinstance(table.columns, MultiIndex)
@pytest.mark.slow
def test_multiindex_header_skiprows(self, banklist_data, flavor_read_html):
    """``header=[0, 1]`` with ``skiprows=1`` produces MultiIndex columns."""
    tables = flavor_read_html(
        banklist_data,
        match="Metcalf",
        attrs={"id": "table"},
        header=[0, 1],
        skiprows=1,
    )
    table = tables[0]
    assert isinstance(table.columns, MultiIndex)
@pytest.mark.slow
def test_multiindex_header_index_skiprows(self, banklist_data, flavor_read_html):
    """List-valued ``header``/``index_col`` plus ``skiprows=1`` give MultiIndex axes."""
    tables = flavor_read_html(
        banklist_data,
        match="Metcalf",
        attrs={"id": "table"},
        header=[0, 1],
        index_col=[0, 1],
        skiprows=1,
    )
    table = tables[0]
    for axis_labels in (table.index, table.columns):
        assert isinstance(axis_labels, MultiIndex)
@pytest.mark.slow
def test_regex_idempotency(self, banklist_data, flavor_read_html):
    """A pattern passed through ``re.compile`` twice is accepted as ``match``."""
    compiled_once = re.compile("Florida")
    # Compiling an already compiled pattern is the point of this test.
    compiled_twice = re.compile(compiled_once)
    file_url = file_path_to_url(os.path.abspath(banklist_data))
    tables = flavor_read_html(
        file_url,
        match=compiled_twice,
        attrs={"id": "table"},
    )
    assert isinstance(tables, list)
    for table in tables:
        assert isinstance(table, DataFrame)
def test_negative_skiprows(self, spam_data, flavor_read_html):
    """A negative ``skiprows`` raises ValueError."""
    expected_msg = r"\(you passed a negative value\)"
    with pytest.raises(ValueError, match=expected_msg):
        flavor_read_html(spam_data, match="Water", skiprows=-1)
@pytest.fixture
def python_docs(self):
    """HTML snippet containing two ``contentstable`` tables of Python doc links."""
    page = """
<table class="contentstable" align="center"><tr>
<td width="50%">
<p class="biglink"><a class="biglink" href="whatsnew/2.7.html">What's new in Python 2.7?</a><br/>
<span class="linkdescr">or <a href="whatsnew/index.html">all "What's new" documents</a> since 2.0</span></p>
<p class="biglink"><a class="biglink" href="tutorial/index.html">Tutorial</a><br/>
<span class="linkdescr">start here</span></p>
<p class="biglink"><a class="biglink" href="library/index.html">Library Reference</a><br/>
<span class="linkdescr">keep this under your pillow</span></p>
<p class="biglink"><a class="biglink" href="reference/index.html">Language Reference</a><br/>
<span class="linkdescr">describes syntax and language elements</span></p>
<p class="biglink"><a class="biglink" href="using/index.html">Python Setup and Usage</a><br/>
<span class="linkdescr">how to use Python on different platforms</span></p>
<p class="biglink"><a class="biglink" href="howto/index.html">Python HOWTOs</a><br/>
<span class="linkdescr">in-depth documents on specific topics</span></p>
</td><td width="50%">
<p class="biglink"><a class="biglink" href="installing/index.html">Installing Python Modules</a><br/>
<span class="linkdescr">installing from the Python Package Index & other sources</span></p>
<p class="biglink"><a class="biglink" href="distributing/index.html">Distributing Python Modules</a><br/>
<span class="linkdescr">publishing modules for installation by others</span></p>
<p class="biglink"><a class="biglink" href="extending/index.html">Extending and Embedding</a><br/>
<span class="linkdescr">tutorial for C/C++ programmers</span></p>
<p class="biglink"><a class="biglink" href="c-api/index.html">Python/C API</a><br/>
<span class="linkdescr">reference for C/C++ programmers</span></p>
<p class="biglink"><a class="biglink" href="faq/index.html">FAQs</a><br/>
<span class="linkdescr">frequently asked questions (with answers!)</span></p>
</td></tr>
</table>
<p><strong>Indices and tables:</strong></p>
<table class="contentstable" align="center"><tr>
<td width="50%">
<p class="biglink"><a class="biglink" href="py-modindex.html">Python Global Module Index</a><br/>
<span class="linkdescr">quick access to all modules</span></p>
<p class="biglink"><a class="biglink" href="genindex.html">General Index</a><br/>
<span class="linkdescr">all functions, classes, terms</span></p>
<p class="biglink"><a class="biglink" href="glossary.html">Glossary</a><br/>
<span class="linkdescr">the most important terms explained</span></p>
</td><td width="50%">
<p class="biglink"><a class="biglink" href="search.html">Search page</a><br/>
<span class="linkdescr">search this documentation</span></p>
<p class="biglink"><a class="biglink" href="contents.html">Complete Table of Contents</a><br/>
<span class="linkdescr">lists all sections and subsections</span></p>
</td></tr>
</table>
"""  # noqa: E501
    return page
@pytest.mark.network
@pytest.mark.single_cpu
def test_multiple_matches(self, python_docs, httpserver, flavor_read_html):
    """``match="Python"`` returns more than one table from the served page."""
    httpserver.serve_content(content=python_docs)
    tables = flavor_read_html(httpserver.url, match="Python")
    table_count = len(tables)
    assert table_count > 1
@pytest.mark.network
@pytest.mark.single_cpu
def test_python_docs_table(self, python_docs, httpserver, flavor_read_html):
    """The first cell of each matched table starts with the expected text."""
    httpserver.serve_content(content=python_docs)
    tables = flavor_read_html(httpserver.url, match="Python")
    # First four characters of the top-left cell of every table.
    prefixes = sorted(table.iloc[0, 0][:4] for table in tables)
    assert prefixes == ["Pyth", "What"]
def test_empty_tables(self, flavor_read_html):
    """
    Make sure that read_html ignores empty tables.

    The markup holds one populated table and one table whose only child
    is an empty <tbody>; exactly one frame must come back.
    """
    markup = """
<table>
<thead>
<tr>
<th>A</th>
<th>B</th>
</tr>
</thead>
<tbody>
<tr>
<td>1</td>
<td>2</td>
</tr>
</tbody>
</table>
<table>
<tbody>
</tbody>
</table>
"""
    tables = flavor_read_html(StringIO(markup))
    assert len(tables) == 1
def test_multiple_tbody(self, flavor_read_html):
    """Rows from every <tbody> of a single table end up in one frame."""
    # GH-20690
    # Read all tbody tags within a single table.
    markup = """<table>
<thead>
<tr>
<th>A</th>
<th>B</th>
</tr>
</thead>
<tbody>
<tr>
<td>1</td>
<td>2</td>
</tr>
</tbody>
<tbody>
<tr>
<td>3</td>
<td>4</td>
</tr>
</tbody>
</table>"""
    tables = flavor_read_html(StringIO(markup))
    result = tables[0]

    expected = DataFrame(data=[[1, 2], [3, 4]], columns=["A", "B"])
    tm.assert_frame_equal(result, expected)
def test_header_and_one_column(self, flavor_read_html):
    """
    Don't fail with bs4 when there is a header and only one column
    as described in issue #9178
    """
    markup = """<table>
<thead>
<tr>
<th>Header</th>
</tr>
</thead>
<tbody>
<tr>
<td>first</td>
</tr>
</tbody>
</table>"""
    tables = flavor_read_html(StringIO(markup))
    result = tables[0]

    expected = DataFrame(data={"Header": "first"}, index=[0])
    tm.assert_frame_equal(result, expected)
def test_thead_without_tr(self, flavor_read_html):
    """
    Ensure parser adds <tr> within <thead> on malformed HTML.
    """
    # The body row mixes <td> and <th> cells; all three must become data.
    markup = """<table>
<thead>
<tr>
<th>Country</th>
<th>Municipality</th>
<th>Year</th>
</tr>
</thead>
<tbody>
<tr>
<td>Ukraine</td>
<th>Odessa</th>
<td>1944</td>
</tr>
</tbody>
</table>"""
    tables = flavor_read_html(StringIO(markup))
    result = tables[0]

    expected = DataFrame(
        data=[["Ukraine", "Odessa", 1944]],
        columns=["Country", "Municipality", "Year"],
    )
    tm.assert_frame_equal(result, expected)
def test_tfoot_read(self, flavor_read_html):
    """
    Make sure that read_html reads tfoot, containing td or th.
    Ignores empty tfoot
    """
    template = """<table>
<thead>
<tr>
<th>A</th>
<th>B</th>
</tr>
</thead>
<tbody>
<tr>
<td>bodyA</td>
<td>bodyB</td>
</tr>
</tbody>
<tfoot>
{footer}
</tfoot>
</table>"""
    column_names = ["A", "B"]

    # Case 1: an empty <tfoot> contributes no row.
    empty_footer_html = template.format(footer="")
    without_footer = flavor_read_html(StringIO(empty_footer_html))[0]

    # Case 2: a footer row mixing <td> and <th> becomes a data row.
    filled_footer_html = template.format(
        footer="<tr><td>footA</td><th>footB</th></tr>"
    )
    with_footer = flavor_read_html(StringIO(filled_footer_html))[0]

    tm.assert_frame_equal(
        without_footer,
        DataFrame(data=[["bodyA", "bodyB"]], columns=column_names),
    )
    tm.assert_frame_equal(
        with_footer,
        DataFrame(
            data=[["bodyA", "bodyB"], ["footA", "footB"]], columns=column_names
        ),
    )
def test_parse_header_of_non_string_column(self, flavor_read_html):
    """With ``header=0`` the header stays text while the data cell parses as int."""
    # GH5048: if header is specified explicitly, an int column should be
    # parsed as int while its header is parsed as str
    markup = """
<table>
<tr>
<td>S</td>
<td>I</td>
</tr>
<tr>
<td>text</td>
<td>1944</td>
</tr>
</table>
"""
    tables = flavor_read_html(StringIO(markup), header=0)
    result = tables[0]

    expected = DataFrame([["text", 1944]], columns=("S", "I"))
    tm.assert_frame_equal(result, expected)
@pytest.mark.slow
def test_banklist_header(self, banklist_data, datapath, flavor_read_html):
    """The parsed bank list table equals the CSV ground truth after cleanup."""
    from pandas.io.html import _remove_whitespace

    def strip_ws(value):
        # Values for which _remove_whitespace raises AttributeError are
        # passed through untouched.
        try:
            return _remove_whitespace(value)
        except AttributeError:
            return value

    parsed = flavor_read_html(
        banklist_data, match="Metcalf", attrs={"id": "table"}
    )[0]
    ground_truth = read_csv(
        datapath("io", "data", "csv", "banklist.csv"),
        converters={"Updated Date": Timestamp, "Closing Date": Timestamp},
    )
    assert parsed.shape == ground_truth.shape

    # Names as they appear in the HTML ...
    html_names = [
        "First Vietnamese American Bank In Vietnamese",
        "Westernbank Puerto Rico En Espanol",
        "R-G Premier Bank of Puerto Rico En Espanol",
        "Eurobank En Espanol",
        "Sanderson State Bank En Espanol",
        "Washington Mutual Bank (Including its subsidiary Washington "
        "Mutual Bank FSB)",
        "Silver State Bank En Espanol",
        "AmTrade International Bank En Espanol",
        "Hamilton Bank, NA En Espanol",
        "The Citizens Savings Bank Pioneer Community Bank, Inc.",
    ]
    # ... and the names they are replaced with, position by position.
    csv_names = [
        "First Vietnamese American Bank",
        "Westernbank Puerto Rico",
        "R-G Premier Bank of Puerto Rico",
        "Eurobank",
        "Sanderson State Bank",
        "Washington Mutual Bank",
        "Silver State Bank",
        "AmTrade International Bank",
        "Hamilton Bank, NA",
        "The Citizens Savings Bank",
    ]

    cleaned = parsed.map(strip_ws).replace(html_names, csv_names)
    expected = ground_truth.map(strip_ws)

    date_columns = ["Closing Date", "Updated Date"]
    cleaned[date_columns] = cleaned[date_columns].apply(to_datetime)

    tm.assert_frame_equal(cleaned, expected)
@pytest.mark.slow
def test_gold_canyon(self, banklist_data, flavor_read_html):
    """"Gold Canyon" is present in both the raw file and the parsed table."""
    needle = "Gold Canyon"
    with open(banklist_data, encoding="utf-8") as handle:
        page = handle.read()
        assert needle in page

    tables = flavor_read_html(
        banklist_data, match="Gold Canyon", attrs={"id": "table"}
    )
    rendered = tables[0].to_string()
    assert needle in rendered
def test_different_number_of_cols(self, flavor_read_html):
    """A short row parses like the same row padded with "nan" cells."""
    # First body row spelled out in full, with explicit "nan" cells.
    padded_markup = """<table>
<thead>
<tr style="text-align: right;">
<th></th>
<th>C_l0_g0</th>
<th>C_l0_g1</th>
<th>C_l0_g2</th>
<th>C_l0_g3</th>
<th>C_l0_g4</th>
</tr>
</thead>
<tbody>
<tr>
<th>R_l0_g0</th>
<td> 0.763</td>
<td> 0.233</td>
<td> nan</td>
<td> nan</td>
<td> nan</td>
</tr>
<tr>
<th>R_l0_g1</th>
<td> 0.244</td>
<td> 0.285</td>
<td> 0.392</td>
<td> 0.137</td>
<td> 0.222</td>
</tr>
</tbody>
</table>"""
    # Same table, but the first body row stops after two data cells.
    ragged_markup = """<table>
<thead>
<tr style="text-align: right;">
<th></th>
<th>C_l0_g0</th>
<th>C_l0_g1</th>
<th>C_l0_g2</th>
<th>C_l0_g3</th>
<th>C_l0_g4</th>
</tr>
</thead>
<tbody>
<tr>
<th>R_l0_g0</th>
<td> 0.763</td>
<td> 0.233</td>
</tr>
<tr>
<th>R_l0_g1</th>
<td> 0.244</td>
<td> 0.285</td>
<td> 0.392</td>
<td> 0.137</td>
<td> 0.222</td>
</tr>
</tbody>
</table>"""
    expected = flavor_read_html(StringIO(padded_markup), index_col=0)[0]
    result = flavor_read_html(StringIO(ragged_markup), index_col=0)[0]
    tm.assert_frame_equal(result, expected)
def test_colspan_rowspan_1(self, flavor_read_html):
    """``colspan="1"`` and ``rowspan="1"`` leave the table layout unchanged."""
    # GH17054
    markup = """
<table>
<tr>
<th>A</th>
<th colspan="1">B</th>
<th rowspan="1">C</th>
</tr>
<tr>
<td>a</td>
<td>b</td>
<td>c</td>
</tr>
</table>
"""
    tables = flavor_read_html(StringIO(markup))
    result = tables[0]

    expected = DataFrame([["a", "b", "c"]], columns=["A", "B", "C"])
    tm.assert_frame_equal(result, expected)
def test_colspan_rowspan_copy_values(self, flavor_read_html):
    """Cells spanning columns or rows have their value copied into each slot."""
    # GH17054
    # In ASCII, with lowercase letters being copies:
    #
    # X x Y Z W
    # A B b z C
    markup = """
<table>
<tr>
<td colspan="2">X</td>
<td>Y</td>
<td rowspan="2">Z</td>
<td>W</td>
</tr>
<tr>
<td>A</td>
<td colspan="2">B</td>
<td>C</td>
</tr>
</table>
"""
    tables = flavor_read_html(StringIO(markup), header=0)
    result = tables[0]

    expected = DataFrame(
        data=[["A", "B", "B", "Z", "C"]], columns=["X", "X.1", "Y", "Z", "W"]
    )
    tm.assert_frame_equal(result, expected)
def test_colspan_rowspan_both_not_1(self, flavor_read_html):
    """A cell with both ``rowspan`` and ``colspan`` set is copied in both directions."""
    # GH17054
    # In ASCII, with lowercase letters being copies:
    #
    # A B b b C
    # a b b b D
    markup = """
<table>
<tr>
<td rowspan="2">A</td>
<td rowspan="2" colspan="3">B</td>
<td>C</td>
</tr>
<tr>
<td>D</td>
</tr>
</table>
"""
    tables = flavor_read_html(StringIO(markup), header=0)
    result = tables[0]

    expected = DataFrame(
        data=[["A", "B", "B", "B", "D"]], columns=["A", "B", "B.1", "B.2", "C"]
    )
    tm.assert_frame_equal(result, expected)
def test_rowspan_at_end_of_row(self, flavor_read_html):
    """A ``rowspan`` cell in the last column is carried into the next row."""
    # GH17054
    # In ASCII, with lowercase letters being copies:
    #
    # A B
    # C b
    markup = """
<table>
<tr>
<td>A</td>
<td rowspan="2">B</td>
</tr>
<tr>
<td>C</td>
</tr>
</table>
"""
    tables = flavor_read_html(StringIO(markup), header=0)
    result = tables[0]

    expected = DataFrame(data=[["C", "B"]], columns=["A", "B"])
    tm.assert_frame_equal(result, expected)
def test_rowspan_only_rows(self, flavor_read_html):
    """A single row of ``rowspan="3"`` cells yields a header plus two data rows."""
    # GH17054
    markup = """
<table>
<tr>
<td rowspan="3">A</td>
<td rowspan="3">B</td>
</tr>
</table>
"""
    tables = flavor_read_html(StringIO(markup), header=0)
    result = tables[0]

    expected = DataFrame(data=[["A", "B"], ["A", "B"]], columns=["A", "B"])
    tm.assert_frame_equal(result, expected)
def test_header_inferred_from_rows_with_only_th(self, flavor_read_html):
    """Leading rows that contain only <th> cells become header levels."""
    # GH17054
    markup = """
        <table>
            <tr>
                <th>A</th>
                <th>B</th>
            </tr>
            <tr>
                <th>a</th>
                <th>b</th>
            </tr>
            <tr>
                <td>1</td>
                <td>2</td>
            </tr>
        </table>
    """
    frame = flavor_read_html(StringIO(markup))[0]

    # Two <th>-only rows -> a two-level column index.
    header = MultiIndex(
        levels=[["A", "B"], ["a", "b"]],
        codes=[[0, 1], [0, 1]],
    )
    wanted = DataFrame(data=[[1, 2]], columns=header)
    tm.assert_frame_equal(frame, wanted)
def test_parse_dates_list(self, flavor_read_html):
    """``parse_dates`` accepts a list of positions as well as of labels."""
    source = DataFrame({"date": date_range("1/1/2001", periods=10)})
    rendered = source.to_html()

    # Same column selected once by position and once by name.
    for date_columns in ([1], ["date"]):
        tables = flavor_read_html(
            StringIO(rendered), parse_dates=date_columns, index_col=0
        )
        tm.assert_frame_equal(source, tables[0])
def test_parse_dates_combine(self, flavor_read_html):
    """A dict ``parse_dates`` merges separate date and time columns."""
    stamps = Series(date_range("1/1/2001", periods=10))

    # Split every timestamp into its textual date and time halves.
    date_text = stamps.map(lambda x: str(x.date()))
    time_text = stamps.map(lambda x: str(x.time()))
    split_frame = DataFrame({"date": date_text, "time": time_text})

    tables = flavor_read_html(
        StringIO(split_frame.to_html()),
        parse_dates={"datetime": [1, 2]},
        index_col=1,
    )

    combined = DataFrame({"datetime": stamps})
    tm.assert_frame_equal(combined, tables[0])
def test_wikipedia_states_table(self, datapath, flavor_read_html):
    """Spot-check shape, last header and area column of the states table."""
    path = datapath("io", "data", "html", "wikipedia_states.html")

    # Fail early with a readable message if the fixture file is unusable.
    assert os.path.isfile(path), f"{path!r} is not a file"
    assert os.path.getsize(path), f"{path!r} is an empty file"

    states = flavor_read_html(path, match="Arizona", header=1)[0]
    assert states.shape == (60, 12)

    last_label = states.columns[-1]
    assert "Unnamed" in last_label

    assert states["sq mi"].dtype == np.dtype("float64")
    first_area = states.loc[0, "sq mi"]
    assert np.allclose(first_area, 665384.04)
def test_wikipedia_states_multiindex(self, datapath, flavor_read_html):
    """Without an explicit header the states table gets two column levels."""
    path = datapath("io", "data", "html", "wikipedia_states.html")
    states = flavor_read_html(path, match="Arizona", index_col=0)[0]

    assert states.shape == (60, 11)

    # Last column: second-level label is an auto-generated "Unnamed" one.
    last_label = states.columns[-1]
    assert "Unnamed" in last_label[1]
    assert states.columns.nlevels == 2

    alaska_area = states.loc["Alaska", ("Total area[2]", "sq mi")]
    assert np.allclose(alaska_area, 665384.04)
def test_parser_error_on_empty_header_row(self, flavor_read_html):
    """With ``header=[0, 1]`` an empty first header row gets "Unnamed" labels."""
    markup = """
        <table>
            <thead>
                <tr><th></th><th></tr>
                <tr><th>A</th><th>B</th></tr>
            </thead>
            <tbody>
                <tr><td>a</td><td>b</td></tr>
            </tbody>
        </table>
    """
    tables = flavor_read_html(StringIO(markup), header=[0, 1])

    header = MultiIndex.from_tuples(
        [("Unnamed: 0_level_0", "A"), ("Unnamed: 1_level_0", "B")]
    )
    wanted = DataFrame([["a", "b"]], columns=header)
    tm.assert_frame_equal(tables[0], wanted)
def test_decimal_rows(self, flavor_read_html):
    """A custom ``decimal`` separator is honoured when parsing numbers."""
    # GH 12907
    markup = """<html>
        <body>
            <table>
                <thead>
                    <tr>
                        <th>Header</th>
                    </tr>
                </thead>
                <tbody>
                    <tr>
                        <td>1100#101</td>
                    </tr>
                </tbody>
            </table>
        </body>
    </html>"""
    frame = flavor_read_html(StringIO(markup), decimal="#")[0]

    wanted = DataFrame(data={"Header": 1100.101}, index=[0])

    # "1100#101" must have been read as the float 1100.101.
    assert frame["Header"].dtype == np.dtype("float64")
    tm.assert_frame_equal(frame, wanted)
@pytest.mark.parametrize("arg", [True, False])
def test_bool_header_arg(self, spam_data, arg, flavor_read_html):
    """Passing a bool as ``header`` is rejected with a TypeError."""
    # GH 6114
    expected_message = (
        "Passing a bool to header is invalid. Use header=None for no header or "
        "header=int or list-like of ints to specify the row(s) making up the "
        "column names"
    )
    # The message contains regex metacharacters, so escape it for `match`.
    pattern = re.escape(expected_message)
    with pytest.raises(TypeError, match=pattern):
        flavor_read_html(spam_data, header=arg)
def test_converters(self, flavor_read_html):
    """A ``converters`` entry keeps the column as (whitespace-stripped) text."""
    # GH 13461
    markup = """<table>
        <thead>
            <tr>
                <th>a</th>
            </tr>
        </thead>
        <tbody>
            <tr>
                <td> 0.763</td>
            </tr>
            <tr>
                <td> 0.244</td>
            </tr>
        </tbody>
    </table>"""
    frame = flavor_read_html(StringIO(markup), converters={"a": str})[0]

    wanted = DataFrame({"a": ["0.763", "0.244"]})
    tm.assert_frame_equal(frame, wanted)
def test_na_values(self, flavor_read_html):
    """Values listed in ``na_values`` are read back as NaN."""
    # GH 13461
    markup = """<table>
        <thead>
            <tr>
                <th>a</th>
            </tr>
        </thead>
        <tbody>
            <tr>
                <td> 0.763</td>
            </tr>
            <tr>
                <td> 0.244</td>
            </tr>
        </tbody>
    </table>"""
    frame = flavor_read_html(StringIO(markup), na_values=[0.244])[0]

    wanted = DataFrame({"a": [0.763, np.nan]})
    tm.assert_frame_equal(frame, wanted)
def test_keep_default_na(self, flavor_read_html):
    """``keep_default_na`` toggles whether "N/A" and "NA" are read as NaN."""
    html_data = """<table>
        <thead>
            <tr>
                <th>a</th>
            </tr>
        </thead>
        <tbody>
            <tr>
                <td> N/A</td>
            </tr>
            <tr>
                <td> NA</td>
            </tr>
        </tbody>
    </table>"""

    # (keep_default_na flag, values expected in column "a")
    cases = (
        (False, ["N/A", "NA"]),
        (True, [np.nan, np.nan]),
    )
    for keep_flag, column_values in cases:
        expected_df = DataFrame({"a": column_values})
        html_df = flavor_read_html(StringIO(html_data), keep_default_na=keep_flag)[0]
        tm.assert_frame_equal(expected_df, html_df)
def test_preserve_empty_rows(self, flavor_read_html):
    """A body row of empty cells is kept and shows up as an all-NaN row."""
    markup = """
        <table>
            <tr>
                <th>A</th>
                <th>B</th>
            </tr>
            <tr>
                <td>a</td>
                <td>b</td>
            </tr>
            <tr>
                <td></td>
                <td></td>
            </tr>
        </table>
    """
    frame = flavor_read_html(StringIO(markup))[0]

    rows = [["a", "b"], [np.nan, np.nan]]
    wanted = DataFrame(data=rows, columns=["A", "B"])
    tm.assert_frame_equal(frame, wanted)
def test_ignore_empty_rows_when_inferring_header(self, flavor_read_html):
    """An empty <thead> row contributes no level to the inferred header."""
    markup = """
        <table>
            <thead>
                <tr><th></th><th></tr>
                <tr><th>A</th><th>B</th></tr>
                <tr><th>a</th><th>b</th></tr>
            </thead>
            <tbody>
                <tr><td>1</td><td>2</td></tr>
            </tbody>
        </table>
    """
    frame = flavor_read_html(StringIO(markup))[0]

    # Three <thead> rows, but only the two non-empty ones become levels.
    header = MultiIndex(
        levels=[["A", "B"], ["a", "b"]],
        codes=[[0, 1], [0, 1]],
    )
    wanted = DataFrame(data=[[1, 2]], columns=header)
    tm.assert_frame_equal(frame, wanted)
def test_multiple_header_rows(self, flavor_read_html):
    """A two-level header written by ``to_html`` survives a round trip."""
    # Issue #13434
    people = [("Hillary", 68, "D"), ("Bernie", 74, "D"), ("Donald", 69, "R")]
    upper_labels = ["Unnamed: 0_level_0", "Age", "Party"]
    lower_labels = ["Name", "Unnamed: 1_level_1", "Unnamed: 2_level_1"]

    original = DataFrame(data=people)
    original.columns = [upper_labels, lower_labels]

    rendered = original.to_html(index=False)
    round_tripped = flavor_read_html(StringIO(rendered))[0]
    tm.assert_frame_equal(original, round_tripped)
def test_works_on_valid_markup(self, datapath, flavor_read_html):
    """Reading the valid-markup fixture returns a list of DataFrames."""
    path = datapath("io", "data", "html", "valid_markup.html")
    tables = flavor_read_html(path, index_col=0)

    assert isinstance(tables, list)
    first = tables[0]
    assert isinstance(first, DataFrame)
@pytest.mark.slow
def test_fallback_success(self, datapath, flavor_read_html):
    """Reading the bank list with a list of flavors must not raise."""
    path = datapath("io", "data", "html", "banklist.html")
    flavors = ["lxml", "html5lib"]
    # No assertion on the result: the call completing is the test.
    flavor_read_html(path, match=".*Water.*", flavor=flavors)
def test_to_html_timestamp(self):
    """Dates of a datetime index appear in the ``to_html`` output."""
    index = date_range("2000-01-01", periods=10)
    values = np.random.default_rng(2).standard_normal((10, 4))
    frame = DataFrame(values, index=index)

    rendered = frame.to_html()
    assert "2000-01-01" in rendered
def test_to_html_borderless(self):
    """Check how ``to_html`` renders the ``border`` attribute."""
    frame = DataFrame([{"A": 1, "B": 2}])

    default_html = frame.to_html()
    true_html = frame.to_html(border=True)
    one_html = frame.to_html(border=1)
    two_html = frame.to_html(border=2)
    zero_html = frame.to_html(border=0)
    false_html = frame.to_html(border=False)

    # Default, True and 1 all render border="1".
    assert ' border="1"' in default_html
    assert true_html == default_html
    assert default_html == one_html

    # Any other positive value is written out verbatim.
    assert default_html != two_html
    assert ' border="2"' in two_html

    # 0 and False both drop the attribute.
    assert ' border="0"' not in zero_html
    assert " border" not in false_html
    assert zero_html == false_html
@pytest.mark.parametrize(
    "displayed_only,exp0,exp1",
    [
        (True, DataFrame(["foo"]), None),
        (False, DataFrame(["foo bar baz qux"]), DataFrame(["foo"])),
    ],
)
def test_displayed_only(self, displayed_only, exp0, exp1, flavor_read_html):
    """``displayed_only`` controls whether ``display:none`` content is read.

    ``exp0`` is the expected first table; ``exp1`` is the expected second
    (hidden) table, or None when that table must not be parsed at all.
    """
    # GH 20027
    markup = """<html>
        <body>
            <table>
                <tr>
                    <td>
                        foo
                        <span style="display:none;text-align:center">bar</span>
                        <span style="display:none">baz</span>
                        <span style="display: none">qux</span>
                    </td>
                </tr>
            </table>
            <table style="display: none">
                <tr>
                    <td>foo</td>
                </tr>
            </table>
        </body>
    </html>"""
    tables = flavor_read_html(StringIO(markup), displayed_only=displayed_only)
    tm.assert_frame_equal(tables[0], exp0)

    if exp1 is None:
        # Should not parse hidden table
        assert len(tables) == 1
    else:
        tm.assert_frame_equal(tables[1], exp1)
@pytest.mark.parametrize("displayed_only", [True, False])
def test_displayed_only_with_many_elements(self, displayed_only, flavor_read_html):
    """An empty hidden <span> does not change the result for either setting."""
    markup = """
        <table>
            <tr>
                <th>A</th>
                <th>B</th>
            </tr>
            <tr>
                <td>1</td>
                <td>2</td>
            </tr>
            <tr>
                <td><span style="display:none"></span>4</td>
                <td>5</td>
            </tr>
        </table>
    """
    tables = flavor_read_html(StringIO(markup), displayed_only=displayed_only)
    frame = tables[0]

    wanted = DataFrame({"A": [1, 4], "B": [2, 5]})
    tm.assert_frame_equal(frame, wanted)
@td.skip_if_windows()
@pytest.mark.filterwarnings(
    "ignore:You provided Unicode markup but also provided a value for "
    "from_encoding.*:UserWarning"
)
def test_encode(self, html_encoding_file, flavor_read_html):
    """Raw bytes, a BytesIO and a file path must all decode to the same frame."""
    # The file stem must contain exactly one underscore; the part after it
    # is used as the encoding name.
    file_name = os.path.basename(html_encoding_file)
    stem = os.path.splitext(file_name)[0]
    _, encoding = stem.split("_")

    def read_single(source):
        # Parse `source` with the file's encoding and return the last table.
        return flavor_read_html(source, encoding=encoding, index_col=0).pop()

    try:
        with open(html_encoding_file, "rb") as fobj:
            from_string = read_single(fobj.read())

        with open(html_encoding_file, "rb") as fobj:
            from_file_like = read_single(BytesIO(fobj.read()))

        from_filename = read_single(html_encoding_file)

        tm.assert_frame_equal(from_string, from_file_like)
        tm.assert_frame_equal(from_string, from_filename)
    except Exception:
        # seems utf-16/32 fail on windows
        # NOTE(review): with the skip_if_windows decorator above this branch
        # looks unreachable -- confirm before removing.
        wide_encoding = "16" in encoding or "32" in encoding
        if is_platform_windows() and wide_encoding:
            pytest.skip()
        raise
def test_parse_failure_unseekable(self, flavor_read_html):
    """A non-seekable stream can be read once; a second read raises."""
    # Issue #17975
    flavor = flavor_read_html.keywords.get("flavor")
    if flavor == "lxml":
        pytest.skip("Not applicable for lxml")

    class UnseekableStringIO(StringIO):
        # StringIO that reports it cannot be rewound.
        def seekable(self):
            return False

    stream = UnseekableStringIO(
        """
            <table><tr><td>spam<foobr />eggs</td></tr></table>"""
    )

    # First pass consumes the stream and still yields a result ...
    assert flavor_read_html(stream)

    # ... but reading the same object again is reported as an error.
    with pytest.raises(ValueError, match="passed a non-rewindable file object"):
        flavor_read_html(stream)
def test_parse_failure_rewinds(self, flavor_read_html):
    """A seekable file-like object parses with and without the odd tag.

    One input is plain markup, the other contains an unknown ``<foobr />``
    tag; both must produce a non-empty result.
    """
    # Issue #17975

    class MockFile:
        # Minimal file-like object: hands out its data once per rewind.
        def __init__(self, data) -> None:
            self.data = data
            self.at_end = False

        def read(self, size=None):
            # `size` is accepted but ignored; the whole payload is returned
            # until the next seek().
            if self.at_end:
                data = ""
            else:
                data = self.data
            self.at_end = True
            return data

        def seek(self, offset):
            # Any seek counts as a rewind to the start.
            self.at_end = False

        def seekable(self):
            return True

        # GH 49036 pylint checks for presence of __next__ for iterators
        def __next__(self):
            ...

        def __iter__(self) -> Iterator:
            # `is_file_like` depends on the presence of
            # the __iter__ attribute.
            return self

    good = MockFile("<table><tr><td>spam<br />eggs</td></tr></table>")
    bad = MockFile("<table><tr><td>spam<foobr />eggs</td></tr></table>")

    for mock_file in (good, bad):
        assert flavor_read_html(mock_file)
@pytest.mark.slow
@pytest.mark.single_cpu
def test_importcheck_thread_safety(self, datapath, flavor_read_html):
    """Run the reader on two threads at once and require that neither raises.

    Parameters
    ----------
    datapath : callable
        Fixture resolving path components to a test-data file path.
    flavor_read_html : callable
        Reader under test; invoked with the file path on each thread.
    """
    # see gh-16928

    class ErrorThread(threading.Thread):
        # Thread that records an exception raised by its target in `err`
        # (None on success) so the main thread can assert on it.
        def run(self):
            try:
                super().run()
            except Exception as err:
                self.err = err
            else:
                self.err = None

    filename = datapath("io", "data", "html", "valid_markup.html")
    helper_thread1 = ErrorThread(target=flavor_read_html, args=(filename,))
    helper_thread2 = ErrorThread(target=flavor_read_html, args=(filename,))

    # Start both before waiting so the two reads actually overlap.
    helper_thread1.start()
    helper_thread2.start()

    # Block until both finish. join() sleeps instead of spinning on
    # is_alive(), so the main thread does not compete with the workers
    # for the GIL while they run.
    helper_thread1.join()
    helper_thread2.join()

    assert None is helper_thread1.err is helper_thread2.err
def test_parse_path_object(self, datapath, flavor_read_html):
    """A ``pathlib.Path`` input gives the same frame as the string path."""
    # GH 37705
    path_as_str = datapath("io", "data", "html", "spam.html")
    path_as_obj = Path(path_as_str)

    from_str = flavor_read_html(path_as_str)[0]
    from_obj = flavor_read_html(path_as_obj)[0]
    tm.assert_frame_equal(from_str, from_obj)
def test_parse_br_as_space(self, flavor_read_html):
    """A <br> inside a cell is read as a single space."""
    # GH 29528: pd.read_html() convert <br> to space
    markup = """
        <table>
            <tr>
                <th>A</th>
            </tr>
            <tr>
                <td>word1<br>word2</td>
            </tr>
        </table>
    """
    frame = flavor_read_html(StringIO(markup))[0]

    wanted = DataFrame(data=[["word1 word2"]], columns=["A"])
    tm.assert_frame_equal(frame, wanted)
@pytest.mark.parametrize("arg", ["all", "body", "header", "footer"])
def test_extract_links(self, arg, flavor_read_html):
    """``extract_links`` turns cells of the chosen section into (text, href).

    Sections not selected by ``arg`` keep their plain text; ``"all"``
    selects header, body and footer together.
    """
    gh_13141_data = """
        <table>
            <tr>
                <th>HTTP</th>
                <th>FTP</th>
                <th><a href="https://en.wiktionary.org/wiki/linkless">Linkless</a></th>
            </tr>
            <tr>
                <td><a href="https://en.wikipedia.org/">Wikipedia</a></td>
                <td>SURROUNDING <a href="ftp://ftp.us.debian.org/">Debian</a> TEXT</td>
                <td>Linkless</td>
            </tr>
            <tfoot>
                <tr>
                    <td><a href="https://en.wikipedia.org/wiki/Page_footer">Footer</a></td>
                    <td>
                        Multiple <a href="1">links:</a> <a href="2">Only first captured.</a>
                    </td>
                </tr>
            </tfoot>
        </table>
    """

    # Expected cell values per section, with ("_extract") and without
    # ("_ignore") link extraction.
    gh_13141_expected = {
        "head_ignore": ["HTTP", "FTP", "Linkless"],
        "head_extract": [
            ("HTTP", None),
            ("FTP", None),
            ("Linkless", "https://en.wiktionary.org/wiki/linkless"),
        ],
        "body_ignore": ["Wikipedia", "SURROUNDING Debian TEXT", "Linkless"],
        "body_extract": [
            ("Wikipedia", "https://en.wikipedia.org/"),
            ("SURROUNDING Debian TEXT", "ftp://ftp.us.debian.org/"),
            ("Linkless", None),
        ],
        "footer_ignore": [
            "Footer",
            "Multiple links: Only first captured.",
            None,
        ],
        "footer_extract": [
            ("Footer", "https://en.wikipedia.org/wiki/Page_footer"),
            ("Multiple links: Only first captured.", "1"),
            None,
        ],
    }

    def expected_for(section, selector):
        # Pick the "_extract" variant when `arg` targets this section.
        mode = "extract" if arg in ("all", selector) else "ignore"
        return gh_13141_expected[f"{section}_{mode}"]

    data_exp = expected_for("body", "body")
    foot_exp = expected_for("footer", "footer")
    head_exp = expected_for("head", "header")

    result = flavor_read_html(StringIO(gh_13141_data), extract_links=arg)[0]

    # Body row first, footer row second, under the header labels.
    expected = DataFrame([data_exp, foot_exp], columns=head_exp)
    expected = expected.fillna(np.nan)
    tm.assert_frame_equal(result, expected)
def test_extract_links_bad(self, spam_data):
    """An unknown ``extract_links`` value raises a ValueError."""
    expected_message = (
        "`extract_links` must be one of "
        '{None, "header", "footer", "body", "all"}, got "incorrect"'
    )
    with pytest.raises(ValueError, match=expected_message):
        read_html(spam_data, extract_links="incorrect")
def test_extract_links_all_no_header(self, flavor_read_html):
    """``extract_links="all"`` works on a table that has no header row."""
    # GH 48316
    markup = """
        <table>
            <tr>
                <td>
                    <a href='https://google.com'>Google.com</a>
                </td>
            </tr>
        </table>
    """
    frame = flavor_read_html(StringIO(markup), extract_links="all")[0]

    link_cell = ("Google.com", "https://google.com")
    wanted = DataFrame([[link_cell]])
    tm.assert_frame_equal(frame, wanted)
def test_invalid_dtype_backend(self):
    """An unsupported ``dtype_backend`` value raises a ValueError."""
    expected_message = (
        "dtype_backend numpy is invalid, only 'numpy_nullable' and "
        "'pyarrow' are allowed."
    )
    with pytest.raises(ValueError, match=expected_message):
        read_html("test", dtype_backend="numpy")
def test_style_tag(self, flavor_read_html):
    """The content of a <style> element inside a cell is not part of its text."""
    # GH 48316
    markup = """
        <table>
            <tr>
                <th>
                    <style>.style</style>
                    A
                </th>
                <th>B</th>
            </tr>
            <tr>
                <td>A1</td>
                <td>B1</td>
            </tr>
            <tr>
                <td>A2</td>
                <td>B2</td>
            </tr>
        </table>
    """
    frame = flavor_read_html(StringIO(markup))[0]

    # The first header must come out as plain "A", without ".style".
    rows = [["A1", "B1"], ["A2", "B2"]]
    wanted = DataFrame(data=rows, columns=["A", "B"])
    tm.assert_frame_equal(frame, wanted)
|