Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make Table.to_arrow() and Table.to_pandas() more robust when there is … #339

Merged
merged 1 commit into from
Jul 4, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
176 changes: 80 additions & 96 deletions src/fmu/sumo/explorer/objects/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,40 +22,58 @@ def __init__(self, sumo: SumoClient, metadata: dict) -> None:
self._dataframe = None
self._arrowtable = None
self._logger = logging.getLogger("__name__" + ".Table")

self._blob = None

def _get_blob(self):
    """Return the object's blob, fetching it at most once.

    The first call reads ``self.blob`` and stores the result in
    ``self._blob``; later calls return the stored value without
    reading ``self.blob`` again.

    Returns:
        The value of ``self.blob`` as obtained on the first call.
    """
    if self._blob is None:
        self._blob = self.blob
    return self._blob

async def _get_blob_async(self):
    """Return the object's blob, awaiting it at most once.

    The first call awaits ``self.blob_async`` and stores the result in
    ``self._blob``; later calls return the stored value without
    awaiting ``self.blob_async`` again.

    Returns:
        The awaited value of ``self.blob_async`` from the first call.
    """
    if self._blob is None:
        self._blob = await self.blob_async
    return self._blob

def _read_table(self):
    """Fetch the (cached) blob and parse it into a pandas table.

    Returns:
        Whatever ``self._construct_table_from_blob`` returns for the blob
        obtained from ``self._get_blob()``.
    """
    blob = self._get_blob()
    return self._construct_table_from_blob(blob)

async def _read_table_async(self):
    """Await the (cached) blob and parse it into a pandas table.

    Returns:
        Whatever ``self._construct_table_from_blob`` returns for the blob
        obtained from ``await self._get_blob_async()``.
    """
    blob = await self._get_blob_async()
    return self._construct_table_from_blob(blob)

def _construct_table_from_blob(self, blob) -> pd.DataFrame:
    """Parse ``blob`` into a pandas DataFrame.

    The reader matching ``self.dataformat`` ("csv", "parquet" or "arrow")
    is tried first. If it fails, or the declared format is not one of
    those three, the remaining readers are tried in the order csv,
    parquet, feather.

    Args:
        blob: The data to parse; it is handed unchanged to the readers.
            Presumably a file-like object such as ``BytesIO`` -- TODO
            confirm against what ``self.blob`` returns.

    Returns:
        DataFrame: The result of the first reader that succeeds.

    Raises:
        TypeError: If no reader can parse the blob. The last reader's
            exception is attached as the cause.
    """
    # Lambdas defer the module lookups so a reader is only resolved when
    # it is actually attempted.
    readers = {
        "csv": lambda data: pd.read_csv(data),
        "parquet": lambda data: pd.read_parquet(data),
        "arrow": lambda data: pf.read_feather(data),
    }
    declared = self.dataformat
    attempts = [declared] if declared in readers else []
    attempts += [fmt for fmt in readers if fmt != declared]

    seekable = getattr(blob, "seekable", None)
    can_rewind = callable(seekable) and seekable()

    last_error = None
    for fmt in attempts:
        # An earlier read (a failed attempt here, or a previous consumer
        # of the cached blob) may have moved the stream position, so
        # every attempt starts from the beginning.
        if can_rewind:
            blob.seek(0)
        try:
            return readers[fmt](blob)
        except Exception as err:  # readers raise many unrelated types
            last_error = err

    raise TypeError(
        f"Unable to convert a blob of format {self.dataformat} to pandas table; tried csv, parquet and feather."
    ) from last_error

def to_pandas(self) -> pd.DataFrame:
    """Return object as a pandas DataFrame

    The table is parsed on the first call via ``self._read_table()`` and
    kept in ``self._dataframe``; later calls return that same object.
    The earlier per-format reading logic is dropped here because
    ``_read_table()`` supersedes it: it would only have read the blob an
    extra time before its result was overwritten.

    Returns:
        DataFrame: A DataFrame object
    """
    if self._dataframe is None:
        self._dataframe = self._read_table()
    return self._dataframe

async def to_pandas_async(self) -> pd.DataFrame:
Expand All @@ -64,34 +82,45 @@ async def to_pandas_async(self) -> pd.DataFrame:
Returns:
DataFrame: A DataFrame object
"""

if self._dataframe is None:
if self["data"]["format"] == "csv":
worked = "csv"
self._logger.debug("Treating blob as csv")
try:
self._dataframe = pd.read_csv(await self.blob_async)
worked = "csv"
self._dataframe = await self._read_table_async()
return self._dataframe

except UnicodeDecodeError as ud_e:
raise UnicodeDecodeError("Maybe not csv?") from ud_e
def _read_arrow(self):
    """Fetch the (cached) blob and parse it into an arrow table.

    Returns:
        Whatever ``self._construct_arrow_from_blob`` returns for the blob
        obtained from ``self._get_blob()``.
    """
    blob = self._get_blob()
    return self._construct_arrow_from_blob(blob)

async def _read_arrow_async(self):
    """Await the (cached) blob and parse it into an arrow table.

    Returns:
        Whatever ``self._construct_arrow_from_blob`` returns for the blob
        obtained from ``await self._get_blob_async()``.
    """
    blob = await self._get_blob_async()
    return self._construct_arrow_from_blob(blob)

def _construct_arrow_from_blob(self, blob):
    """Parse ``blob`` into an arrow table.

    The reader matching ``self.dataformat`` ("csv", "parquet" or "arrow")
    is tried first. If it fails, or the declared format is not one of
    those three, the remaining readers are tried in the order csv,
    parquet, feather. CSV data is read with pandas and then converted
    with ``pa.Table.from_pandas``.

    Args:
        blob: The data to parse; it is handed unchanged to the readers.
            Presumably a file-like object such as ``BytesIO`` -- TODO
            confirm against what ``self.blob`` returns.

    Returns:
        The arrow table produced by the first reader that succeeds.

    Raises:
        TypeError: If no reader can parse the blob. The last reader's
            exception is attached as the cause.
    """
    # Lambdas defer the module lookups so a reader is only resolved when
    # it is actually attempted.
    readers = {
        "csv": lambda data: pa.Table.from_pandas(pd.read_csv(data)),
        "parquet": lambda data: pq.read_table(data),
        # The feather fallback must read the ``blob`` argument; the
        # previous ``selfblob`` was an undefined name, so this path
        # always failed.
        "arrow": lambda data: pf.read_table(data),
    }
    declared = self.dataformat
    attempts = [declared] if declared in readers else []
    attempts += [fmt for fmt in readers if fmt != declared]

    seekable = getattr(blob, "seekable", None)
    can_rewind = callable(seekable) and seekable()

    last_error = None
    for fmt in attempts:
        # An earlier read (a failed attempt here, or a previous consumer
        # of the cached blob) may have moved the stream position, so
        # every attempt starts from the beginning.
        if can_rewind:
            blob.seek(0)
        try:
            return readers[fmt](blob)
        except Exception as err:  # readers raise many unrelated types
            last_error = err

    raise TypeError(
        f"Unable to convert a blob of format {self.dataformat} to arrow; tried csv, parquet and feather."
    ) from last_error

def to_arrow(self) -> pa.Table:
"""Return object as an arrow Table
Expand All @@ -100,31 +129,7 @@ def to_arrow(self) -> pa.Table:
pa.Table: _description_
"""
if self._arrowtable is None:
if self["data"]["format"] == "parquet":
worked = "parquet"
self._arrowtable = pq.read_table(self.blob)
elif self["data"]["format"] == "arrow":
try:
worked = "feather"
self._arrowtable = pf.read_table(self.blob)
except pa.lib.ArrowInvalid:
worked = "parquet"
self._arrowtable = pq.read_table(self.blob)
else:
warn(
"Reading csv format into arrow, you will not get the full benefit of native arrow"
)
worked = "csv"
try:
self._arrowtable = pa.Table.from_pandas(
pd.read_csv(self.blob)
)

except TypeError as type_err:
raise OSError("Cannot read this into arrow") from type_err

self._logger.debug("Read blob as %s to return arrow", worked)

self._arrowtable = self._read_arrow()
return self._arrowtable

async def to_arrow_async(self) -> pa.Table:
Expand All @@ -134,26 +139,5 @@ async def to_arrow_async(self) -> pa.Table:
pa.Table: _description_
"""
if self._arrowtable is None:
if self["data"]["format"] == "arrow":
try:
worked = "feather"
self._arrowtable = pf.read_table(await self.blob_async)
except pa.lib.ArrowInvalid:
worked = "parquet"
self._arrowtable = pq.read_table(await self.blob_async)
else:
warn(
"Reading csv format into arrow, you will not get the full benefit of native arrow"
)
worked = "csv"
try:
self._arrowtable = pa.Table.from_pandas(
pd.read_csv(await self.blob_async)
)

except TypeError as type_err:
raise OSError("Cannot read this into arrow") from type_err

self._logger.debug("Read blob as %s to return arrow", worked)

self._arrowtable = await self._read_arrow_async()
return self._arrowtable