Skip to content

Commit

Permalink
[api] Improve normal file upload design for public APIs (#3878)
Browse files Browse the repository at this point in the history
- The original file upload implementation is super old and directly uploads the file by checking the available upload handlers set in `settings.py` when the request reaches the first middleware.
- This flow did not follow the current Hue filebrowser design of routing the FS calls via ProxyFS. 
- The design also had historical flaws of uploading the file even if request was meant for other file operation (if you send a file with /copy call, it will still upload the file and then try doing the copy operation).
- There was no flexibility for upload pre-checks and error handling. Most of such stuff was shifted post file upload.


- The new implementation helps in solving all the above problem along with cleaner code, improved design, upload on-demand, greater control and performance improvements.
- The new implementation is only available in API form and does not affect or modifies the older implementation (both are available in parallel for the time being) because we are in the process of phasing out the old filebrowser.
- Once the new filebrowser is upto speed, the old implementation will be removed.
  • Loading branch information
Harshg999 authored Nov 21, 2024
1 parent ff1e56e commit 577b140
Show file tree
Hide file tree
Showing 19 changed files with 785 additions and 182 deletions.
106 changes: 37 additions & 69 deletions apps/filebrowser/src/filebrowser/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import operator
import mimetypes
import posixpath
from io import BytesIO as string_io

from django.core.paginator import EmptyPage, Paginator
from django.http import HttpResponse, HttpResponseNotModified, HttpResponseRedirect, StreamingHttpResponse
Expand All @@ -46,6 +47,7 @@
FILE_DOWNLOAD_CACHE_CONTROL,
MAX_FILE_SIZE_UPLOAD_LIMIT,
REDIRECT_DOWNLOAD,
RESTRICT_FILE_EXTENSIONS,
SHOW_DOWNLOAD_BUTTON,
)
from filebrowser.lib import xxd
Expand Down Expand Up @@ -394,81 +396,46 @@ def upload_complete(request):

@api_error_handler
def upload_file(request):
"""
A wrapper around the actual upload view function to clean up the temporary file afterwards if it fails.
Returns JSON.
"""
pass
# response = {}

# try:
# response = _upload_file(request)
# except Exception as e:
# LOG.exception('Upload operation failed.')

# file = request.FILES.get('file')
# if file and hasattr(file, 'remove'): # TODO: Call from proxyFS -- Check feasibility of this old comment
# file.remove()

# return HttpResponse(str(e).split('\n', 1)[0], status=500) # TODO: Check error message and status code

# return JsonResponse(response)
# Read request body first to prevent RawPostDataException later on which occurs when trying to access body after it has already been read
body_data_bytes = string_io(request.body)


def _upload_file(request):
"""
Handles file uploaded by HDFSfileUploadHandler.
The uploaded file is stored in HDFS at its destination with a .tmp suffix.
We just need to rename it to the destination path.
"""
uploaded_file = request.FILES['file']
dest_path = request.GET.get('dest')
response = {}

if MAX_FILE_SIZE_UPLOAD_LIMIT.get() >= 0 and uploaded_file.size > MAX_FILE_SIZE_UPLOAD_LIMIT.get():
return HttpResponse(f'File exceeds maximum allowed size of {MAX_FILE_SIZE_UPLOAD_LIMIT.get()} bytes.', status=500)
dest_path = request.POST.get('destination_path')

# Use form for now to triger the upload handler process by Django.
# Might be a better solution now to try directly using handler in request.fs.upload() for all FS.
# form = UploadAPIFileForm(request.POST, request.FILES)
# Check if the file type is restricted
_, file_type = os.path.splitext(uploaded_file.name)
if RESTRICT_FILE_EXTENSIONS.get() and file_type.lower() in [ext.lower() for ext in RESTRICT_FILE_EXTENSIONS.get()]:
return HttpResponse(f'File type "{file_type}" is not allowed. Please choose a file with a different type.', status=400)

if request.META.get('upload_failed'):
raise Exception(request.META.get('upload_failed')) # TODO: Check error message and status code
# Check if the file size exceeds the maximum allowed size
max_size = MAX_FILE_SIZE_UPLOAD_LIMIT.get()
if max_size >= 0 and uploaded_file.size >= max_size:
return HttpResponse(f'File exceeds maximum allowed size of {max_size} bytes. Please upload a smaller file.', status=413)

# if not form.is_valid():
# raise Exception(f"Error in upload form: {form.errors}")
# Check if the destination path is a directory and the file name contains a path separator
# This prevents directory traversal attacks
if request.fs.isdir(dest_path) and posixpath.sep in uploaded_file.name:
return HttpResponse(f'Invalid filename. Path separators are not allowed.', status=400)

# Check if the file already exists at the destination path
filepath = request.fs.join(dest_path, uploaded_file.name)
if request.fs.exists(filepath):
return HttpResponse(f'The file path {filepath} already exists.', status=409)

if request.fs.isdir(dest_path) and posixpath.sep in uploaded_file.name:
raise Exception(f'Upload failed: {posixpath.sep} is not allowed in the filename {uploaded_file.name}.') # TODO: status code
# Check if the destination path already exists or not
if not request.fs.exists(dest_path):
return HttpResponse(f'The destination path {dest_path} does not exist.', status=404)

try:
request.fs.upload(file=uploaded_file, path=dest_path, username=request.user.username)
except IOError as ex:
already_exists = False
try:
already_exists = request.fs.exists(dest_path)
except Exception:
pass

if already_exists:
messsage = f'Upload failed: Destination {filepath} already exists.'
else:
messsage = f'Upload error: Copy to {filepath} failed: {str(ex)}'
raise Exception(messsage) # TODO: Check error messages above and status code
request.fs.upload_v1(request.META, input_data=body_data_bytes, destination=dest_path, username=request.user.username)
except Exception as ex:
return HttpResponse(f'Upload to {filepath} failed: {str(ex)}', status=500)

# TODO: Check response fields below
response.update(
{
'path': filepath,
'result': _massage_stats(request, stat_absolute_path(filepath, request.fs.stats(filepath))),
}
)
response = {
'uploaded_file_stats': _massage_stats(request, stat_absolute_path(filepath, request.fs.stats(filepath))),
}

return response
return JsonResponse(response)


@api_error_handler
Expand Down Expand Up @@ -766,7 +733,6 @@ def bulk_op(request, op):

error_dict = {}
for p in path_list:

tmp_dict = bulk_dict
if op in (copy, move):
tmp_dict['source_path'] = p
Expand Down Expand Up @@ -795,10 +761,12 @@ def _massage_stats(request, stats):
stats_dict = stats.to_json_dict()
normalized_path = request.fs.normpath(stats_dict.get('path'))

stats_dict.update({
'path': normalized_path,
'type': filetype(stats.mode),
'rwx': rwx(stats.mode, stats.aclBit),
})
stats_dict.update(
{
'path': normalized_path,
'type': filetype(stats.mode),
'rwx': rwx(stats.mode, stats.aclBit),
}
)

return stats_dict
234 changes: 234 additions & 0 deletions apps/filebrowser/src/filebrowser/api_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,234 @@
#!/usr/bin/env python
# Licensed to Cloudera, Inc. under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. Cloudera, Inc. licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
from unittest.mock import Mock, patch

from django.core.files.uploadedfile import SimpleUploadedFile

from filebrowser.api import upload_file
from filebrowser.conf import (
MAX_FILE_SIZE_UPLOAD_LIMIT,
RESTRICT_FILE_EXTENSIONS,
)


class TestNormalFileUpload:
def test_file_upload_success(self):
with patch('filebrowser.api.string_io') as string_io:
with patch('filebrowser.api.stat_absolute_path') as stat_absolute_path:
with patch('filebrowser.api._massage_stats') as _massage_stats:
request = Mock(
method='POST',
META=Mock(),
POST={'destination_path': 's3a://test-bucket/test-user/'},
FILES={'file': SimpleUploadedFile('test_file.txt', b'Hello World!')},
body=Mock(),
fs=Mock(
join=Mock(),
exists=Mock(side_effect=[False, True]),
isdir=Mock(return_value=False),
upload_v1=Mock(return_value=None),
stats=Mock(),
),
)

_massage_stats.return_value = {
"path": "s3a://test-bucket/test-user/test_file.txt",
"size": 12,
"atime": 1731527617,
"mtime": 1731527620,
"mode": 33188,
"user": "test-user",
"group": "test-user",
"blockSize": 134217728,
"replication": 3,
"type": "file",
"rwx": "-rw-r--r--",
}

resets = [
RESTRICT_FILE_EXTENSIONS.set_for_testing(''),
MAX_FILE_SIZE_UPLOAD_LIMIT.set_for_testing(-1),
]
try:
response = upload_file(request)
print(response.content)
response_data = json.loads(response.content)

assert response.status_code == 200
assert response_data['uploaded_file_stats'] == {
"path": "s3a://test-bucket/test-user/test_file.txt",
"size": 12,
"atime": 1731527617,
"mtime": 1731527620,
"mode": 33188,
"user": "test-user",
"group": "test-user",
"blockSize": 134217728,
"replication": 3,
"type": "file",
"rwx": "-rw-r--r--",
}
finally:
for reset in resets:
reset()

def test_upload_invalid_file_type(self):
with patch('filebrowser.api.string_io') as string_io:
request = Mock(
method='POST',
META=Mock(),
POST={'destination_path': 's3a://test-bucket/test-user/'},
FILES={'file': SimpleUploadedFile('test_file.txt', b'Hello World!')},
body=Mock(),
fs=Mock(
join=Mock(),
exists=Mock(side_effect=[False, True]),
isdir=Mock(return_value=False),
upload_v1=Mock(return_value=None),
stats=Mock(),
),
)
resets = [
RESTRICT_FILE_EXTENSIONS.set_for_testing('.exe,.txt'),
MAX_FILE_SIZE_UPLOAD_LIMIT.set_for_testing(-1),
]
try:
response = upload_file(request)

assert response.status_code == 400
assert response.content.decode('utf-8') == 'File type ".txt" is not allowed. Please choose a file with a different type.'
finally:
for reset in resets:
reset()

def test_upload_file_exceeds_max_size(self):
with patch('filebrowser.api.string_io') as string_io:
request = Mock(
method='POST',
META=Mock(),
POST={'destination_path': 's3a://test-bucket/test-user/'},
FILES={'file': SimpleUploadedFile('test_file.txt', b'Hello World!')},
body=Mock(),
fs=Mock(
join=Mock(),
exists=Mock(side_effect=[False, True]),
isdir=Mock(return_value=False),
upload_v1=Mock(return_value=None),
stats=Mock(),
),
)
resets = [
RESTRICT_FILE_EXTENSIONS.set_for_testing(''),
MAX_FILE_SIZE_UPLOAD_LIMIT.set_for_testing(5),
]
try:
response = upload_file(request)

assert response.status_code == 413
assert response.content.decode('utf-8') == 'File exceeds maximum allowed size of 5 bytes. Please upload a smaller file.'
finally:
for reset in resets:
reset()

def test_upload_file_already_exists(self):
with patch('filebrowser.api.string_io') as string_io:
request = Mock(
method='POST',
META=Mock(),
POST={'destination_path': 's3a://test-bucket/test-user/'},
FILES={'file': SimpleUploadedFile('test_file.txt', b'Hello World!')},
body=Mock(),
fs=Mock(
join=Mock(return_value='s3a://test-bucket/test-user/test_file.txt'),
exists=Mock(return_value=True),
isdir=Mock(return_value=True),
upload_v1=Mock(return_value=None),
stats=Mock(),
),
)
resets = [
RESTRICT_FILE_EXTENSIONS.set_for_testing(''),
MAX_FILE_SIZE_UPLOAD_LIMIT.set_for_testing(-1),
]
try:
response = upload_file(request)

assert response.status_code == 409
assert response.content.decode('utf-8') == 'The file path s3a://test-bucket/test-user/test_file.txt already exists.'
finally:
for reset in resets:
reset()

def test_destination_path_does_not_exists(self):
with patch('filebrowser.api.string_io') as string_io:
request = Mock(
method='POST',
META=Mock(),
POST={'destination_path': 's3a://test-bucket/test-user/'},
FILES={'file': SimpleUploadedFile('test_file.txt', b'Hello World!')},
body=Mock(),
fs=Mock(
join=Mock(),
exists=Mock(return_value=False),
isdir=Mock(return_value=True),
upload_v1=Mock(return_value=None),
stats=Mock(),
),
)
resets = [
RESTRICT_FILE_EXTENSIONS.set_for_testing(''),
MAX_FILE_SIZE_UPLOAD_LIMIT.set_for_testing(-1),
]
try:
response = upload_file(request)

assert response.status_code == 404
assert response.content.decode('utf-8') == 'The destination path s3a://test-bucket/test-user/ does not exist.'
finally:
for reset in resets:
reset()

def test_file_upload_failure(self):
with patch('filebrowser.api.string_io') as string_io:
request = Mock(
method='POST',
META=Mock(),
POST={'destination_path': 's3a://test-bucket/test-user/'},
FILES={'file': SimpleUploadedFile('test_file.txt', b'Hello World!')},
body=Mock(),
fs=Mock(
join=Mock(return_value='s3a://test-bucket/test-user/test_file.txt'),
exists=Mock(side_effect=[False, True]),
isdir=Mock(return_value=True),
upload_v1=Mock(side_effect=Exception('Upload exception occured!')),
stats=Mock(),
),
)
resets = [
RESTRICT_FILE_EXTENSIONS.set_for_testing(''),
MAX_FILE_SIZE_UPLOAD_LIMIT.set_for_testing(-1),
]
try:
response = upload_file(request)

assert response.status_code == 500
assert response.content.decode('utf-8') == 'Upload to s3a://test-bucket/test-user/test_file.txt failed: Upload exception occured!'
finally:
for reset in resets:
reset()
Loading

0 comments on commit 577b140

Please sign in to comment.