From 246472ee9e5bdffbddfecfdc464d24bcddf2ee19 Mon Sep 17 00:00:00 2001 From: OhMaley Date: Thu, 26 Sep 2024 17:07:43 -0400 Subject: [PATCH 1/4] convert orphan files deletion task to async using celery task. Update celery config to make it possible --- Dockerfile | 2 +- pyproject.toml | 2 +- src/apps/analytics/tasks.py | 72 +++++++++++++++ src/apps/api/urls.py | 1 + src/apps/api/views/analytics.py | 51 ++++++----- src/settings/base.py | 2 + src/static/js/ours/client.js | 3 + src/static/riot/analytics/analytics.tag | 113 +++++++++++++++++++++--- 8 files changed, 204 insertions(+), 42 deletions(-) diff --git a/Dockerfile b/Dockerfile index eed648e54..d67eb0830 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM python:3.9 +FROM python:3.9.19 RUN apt-get update && apt-get install -y gcc build-essential && rm -rf /var/lib/apt/lists/* diff --git a/pyproject.toml b/pyproject.toml index 39538b909..6d53d4561 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,7 +19,7 @@ channels = "2.4" channels-redis = "3.2.0" django-extra-fields = "0.9" pillow = "8.0.1" -celery = "4.2.1" +celery = "4.4.7" gunicorn = "20.0.4" urllib3 = ">=1.21.1,<1.25" uvicorn = {version = "0.13.3", extras = ["standard"]} diff --git a/src/apps/analytics/tasks.py b/src/apps/analytics/tasks.py index b277f4cc8..833132789 100644 --- a/src/apps/analytics/tasks.py +++ b/src/apps/analytics/tasks.py @@ -1,3 +1,4 @@ +import os import time import logging from celery_config import app @@ -570,3 +571,74 @@ def reset_computed_storage_analytics(): elapsed_time ) ) + +@app.task(queue="site-worker") +def delete_orphan_files(): + logger.info("Task delete_orphan_files started") + + # Find most recent file + most_recent_log_file = get_most_recent_storage_inconsistency_log_file(logger) + if not most_recent_log_file: + logger.warning("No storage inconsistency log file found. Nothing will be removed") + raise Exception("No storage inconsistency log file found") + + # Get the list of orphan files from the content of the most recent log file + log_folder = "/app/logs/" + orphan_files_path = get_files_path_from_orphan_log_file(os.path.join(log_folder, most_recent_log_file), logger) + + # Delete those files in batch (max 1000 element at once) + batch_size = 1000 + for i in range(0, len(orphan_files_path), batch_size): + batch = orphan_files_path[i:i + batch_size] + objects_formatted = [{'Key': path} for path in batch] + BundleStorage.bucket.delete_objects(Delete={'Objects': objects_formatted}) + + logger.info("Delete oprhan files finished") + + +def get_most_recent_storage_inconsistency_log_file(logger): + log_folder = "/app/logs/" + try: + log_files = [f for f in os.listdir(log_folder) if os.path.isfile(os.path.join(log_folder, f))] + except FileNotFoundError: + logger.info(f"Folder '{log_folder}' does not exist.") + return None + + most_recent_log_file = None + most_recent_datetime = None + datetime_format = "%Y%m%d-%H%M%S" + for file in log_files: + try: + basename = os.path.basename(file) + datetime_str = basename[len("db_storage_inconsistency_"):-len(".log")] + file_datetime = datetime.strptime(datetime_str, datetime_format) + if most_recent_datetime is None or file_datetime > most_recent_datetime: + most_recent_datetime = file_datetime + most_recent_log_file = file + except ValueError: + logger.warning(f"Filename '{file}' does not match the expected format and will be ignored.") + + return most_recent_log_file + +def get_files_path_from_orphan_log_file(log_file_path, logger): + files_path = [] + + try: + with open(log_file_path) as log_file: + lines = log_file.readlines() + orphan_files_lines = [] + for i, line in enumerate(lines): + if "Orphaned files" in line: + orphan_files_lines = lines[i + 1:] + break + + for orphan_files_line in orphan_files_lines: + files_path.append(orphan_files_line.split(maxsplit=1)[0]) + except FileNotFoundError: + logger.error(f"File '{log_file_path}' does not exist.") + except PermissionError: + logger.error(f"Permission denied for reading the file '{log_file_path}'.") + except IOError as e: + logger.error(f"An I/O error occurred while accessing the file at {log_file_path}: {e}") + + return files_path diff --git a/src/apps/api/urls.py b/src/apps/api/urls.py index 517b7c50f..6fc171c1c 100644 --- a/src/apps/api/urls.py +++ b/src/apps/api/urls.py @@ -69,6 +69,7 @@ path('analytics/users_usage/', analytics.users_usage, name='users_usage'), path('analytics/delete_orphan_files/', analytics.delete_orphan_files, name="delete_orphan_files"), path('analytics/get_orphan_files/', analytics.get_orphan_files, name="get_orphan_files"), + path('analytics/check_orphans_deletion_status/', analytics.check_orphans_deletion_status, name="check_orphans_deletion_status"), # API Docs re_path(r'docs(?P\.json|\.yaml)$', schema_view.without_ui(cache_timeout=0), name='schema-json'), diff --git a/src/apps/api/views/analytics.py b/src/apps/api/views/analytics.py index 922d0f438..5d482bcb9 100644 --- a/src/apps/api/views/analytics.py +++ b/src/apps/api/views/analytics.py @@ -12,7 +12,7 @@ from competitions.models import Competition, Submission from analytics.models import StorageUsageHistory, CompetitionStorageDataPoint, UserStorageDataPoint from api.serializers.analytics import AnalyticsSerializer -from utils.storage import BundleStorage +from apps.analytics.tasks import delete_orphan_files as delete_orphan_files_async_task import os import datetime @@ -22,6 +22,7 @@ User = get_user_model() +delete_orphan_files_task = None class SimpleFilterBackend(BaseFilterBackend): @@ -318,36 +319,16 @@ def get_orphan_files(request): @api_view(["DELETE"]) def delete_orphan_files(request): """ - Delete all orphan files from the storage based on the last storage analytics + Start the deletion of orphan files task """ if not request.user.is_superuser: raise PermissionDenied(detail="Admin only") + + global delete_orphan_files_task + delete_orphan_files_task = delete_orphan_files_async_task.delay() - logger = logging.getLogger(__name__) - logger.info("Delete orphan files started") - - # The analytics task generates a db_storage_inconsistency_-