Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor MIME type detection #893

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions isic/ingest/models/accession.py
Original file line number Diff line number Diff line change
Expand Up @@ -295,8 +295,7 @@ def generate_blob(self):
try:
with self.original_blob.open("rb") as original_blob_stream:
blob_mime_type = guess_mime_type(original_blob_stream, self.original_blob_name)
blob_major_mime_type = blob_mime_type.partition("/")[0]
if blob_major_mime_type != "image":
if blob_mime_type.major != "image":
raise InvalidBlobError( # noqa: TRY301
f'Blob has a non-image MIME type: "{blob_mime_type}"'
)
Expand Down
6 changes: 3 additions & 3 deletions isic/ingest/tests/test_utils_mime.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import pathlib

from isic.ingest.utils.mime import guess_mime_type
from isic.ingest.utils.mime import MimeType, guess_mime_type

data_dir = pathlib.Path(__file__).parent / "data"

Expand All @@ -11,7 +11,7 @@ def test_utils_mime_guess_mime_type_consistent(caplog):
with file_path.open("rb") as stream:
mime_type = guess_mime_type(stream, file_path.name)

assert mime_type == "image/jpeg"
assert mime_type == MimeType("image/jpeg")
assert not any("Inconsistent MIME types" in msg for msg in caplog.messages)


Expand All @@ -21,7 +21,7 @@ def test_utils_mime_guess_mime_type_inconsistent(caplog):
with file_path.open("rb") as stream:
mime_type = guess_mime_type(stream, "ISIC_0000000.gif")

assert mime_type == "image/jpeg"
assert mime_type == MimeType("image/jpeg")
message = next((msg for msg in caplog.messages if "Inconsistent MIME types" in msg), None)
assert message
assert '"image/gif"' in message
36 changes: 25 additions & 11 deletions isic/ingest/utils/mime.py
Original file line number Diff line number Diff line change
@@ -1,41 +1,55 @@
from dataclasses import dataclass
import logging
import mimetypes
import shutil
import tempfile
from typing import IO

import magic
from magic import Magic

logger = logging.getLogger(__name__)


def guess_mime_type(content: IO[bytes], source_filename: str | None = None) -> str:
@dataclass
class MimeType:
    """
    A MIME type split into its major and minor parts.

    Construct it from the combined string form, e.g. ``MimeType("image/jpeg")``.
    Equality is generated by ``@dataclass`` and compares ``(major, minor)``;
    a ``MimeType`` never compares equal to a plain ``str``.
    """

    # The part before the first "/", e.g. "image".
    major: str
    # The part after the first "/", e.g. "jpeg"; empty if the input has no "/".
    minor: str

    def __init__(self, mime_type: str) -> None:
        # A hand-written __init__ takes precedence over the dataclass-generated
        # one, so the constructor accepts a single "major/minor" string.
        self.major, _, self.minor = mime_type.partition("/")

    def __str__(self) -> str:
        return f"{self.major}/{self.minor}"

    def __repr__(self) -> str:
        # The dataclass-generated repr would print MimeType(major=..., minor=...),
        # which is not a valid call for the single-string constructor above.
        return f"{type(self).__name__}({str(self)!r})"


def guess_mime_type(content: IO[bytes], source_filename: str | None = None) -> MimeType:
"""
Guess the MIME type of a file, based on its content.

An optional `source_filename` can be provided, to give extra context for guessing.
"""
m = magic.Magic(mime=True)
magic = Magic(mime=True)

# This initial seek is just defensive
content.seek(0)
with tempfile.SpooledTemporaryFile() as file_stream:
# Copy blob_stream into a SpooledTemporaryFile so it can be used by magic,
with tempfile.TemporaryFile() as file_stream:
# Copy `content` into a TemporaryFile so it can be used by magic,
# which does not accept a file-like object
shutil.copyfileobj(content, file_stream)
file_stream.seek(0)

# Calling .fileno() forces the file to be flushed to disk
content_mime_type = m.from_descriptor(file_stream.fileno())
content_mime_type = MimeType(magic.from_descriptor(file_stream.fileno()))
content.seek(0)

if source_filename is not None:
source_filename_mime_type = mimetypes.guess_type(source_filename, strict=False)[0]
if source_filename_mime_type is not None and source_filename_mime_type != content_mime_type:
# Right now, do not rely on `filename_mime_type` for the return value, but
source_filename_mime_type = MimeType(
mimetypes.guess_type(source_filename, strict=False)[0] or "application/octet-stream"
)
if source_filename_mime_type != content_mime_type:
# Right now, do not rely on `source_filename_mime_type` for the return value, but
# warn if it's inconsistent with the content.
logger.warning(
'Inconsistent MIME types: content "%s", filename %s "%s"',
'Inconsistent MIME types: content is "%s", filename "%s" is "%s"',
content_mime_type,
source_filename,
source_filename_mime_type,
Expand Down
Loading