diff --git a/.gitignore b/.gitignore index b5b34f56a..4ac22b2f4 100644 --- a/.gitignore +++ b/.gitignore @@ -13,6 +13,7 @@ var/ var_*/ certs/ backups/ +logs/ src/static/output.css src/static/output.js diff --git a/docker-compose.yml b/docker-compose.yml index d45c6d013..4d89fe0bb 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -30,6 +30,7 @@ services: - .:/app:delegated - /tmp/codalab-v2/django:/codalab_tmp - ./backups:/app/backups + - ./var/logs:/app/logs restart: unless-stopped ports: - 8000:8000 diff --git a/src/apps/analytics/migrations/0001_initial.py b/src/apps/analytics/migrations/0001_initial.py new file mode 100644 index 000000000..b5aefc6c0 --- /dev/null +++ b/src/apps/analytics/migrations/0001_initial.py @@ -0,0 +1,62 @@ +# Generated by Django 2.2.17 on 2023-09-14 13:19 + +from django.conf import settings +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + initial = True + + dependencies = [ + ('competitions', '0035_auto_20230914_1319'), + migrations.swappable_dependency(settings.AUTH_USER_MODEL), + ] + + operations = [ + migrations.CreateModel( + name='AdminStorageDataPoint', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('backups_total', models.DecimalField(blank=True, decimal_places=2, max_digits=14, null=True)), + ('at_date', models.DateTimeField()), + ('created_at', models.DateTimeField(auto_now_add=True)), + ], + ), + migrations.CreateModel( + name='StorageUsageHistory', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('bucket_name', models.CharField(max_length=255)), + ('total_usage', models.DecimalField(blank=True, decimal_places=2, max_digits=14, null=True)), + ('competitions_usage', models.DecimalField(blank=True, decimal_places=2, max_digits=14, null=True)), + ('users_usage', models.DecimalField(blank=True, decimal_places=2, 
max_digits=14, null=True)), + ('admin_usage', models.DecimalField(blank=True, decimal_places=2, max_digits=14, null=True)), + ('orphaned_file_usage', models.DecimalField(blank=True, decimal_places=2, max_digits=14, null=True)), + ('at_date', models.DateTimeField()), + ('created_at', models.DateTimeField(auto_now_add=True)), + ], + ), + migrations.CreateModel( + name='UserStorageDataPoint', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('datasets_total', models.DecimalField(blank=True, decimal_places=2, max_digits=14, null=True)), + ('submissions_total', models.DecimalField(blank=True, decimal_places=2, max_digits=14, null=True)), + ('at_date', models.DateTimeField()), + ('created_at', models.DateTimeField(auto_now_add=True)), + ('user', models.ForeignKey(null=True, on_delete=django.db.models.deletion.SET_NULL, to=settings.AUTH_USER_MODEL)), + ], + ), + migrations.CreateModel( + name='CompetitionStorageDataPoint', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('datasets_total', models.DecimalField(blank=True, decimal_places=2, max_digits=14, null=True)), + ('at_date', models.DateTimeField()), + ('created_at', models.DateTimeField(auto_now_add=True)), + ('competition', models.ForeignKey(null=True, on_delete=django.db.models.deletion.SET_NULL, to='competitions.Competition')), + ], + ), + ] diff --git a/src/apps/analytics/migrations/__init__.py b/src/apps/analytics/migrations/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/apps/analytics/models.py b/src/apps/analytics/models.py new file mode 100644 index 000000000..9c1ade208 --- /dev/null +++ b/src/apps/analytics/models.py @@ -0,0 +1,55 @@ +from django.db import models +from django.conf import settings +from competitions.models import Competition + + +class StorageUsageHistory(models.Model): + bucket_name = models.CharField(max_length=255) + total_usage = 
models.DecimalField(
+        max_digits=14, decimal_places=2, null=True, blank=True
+    )  # in KiB up to ~ 930 TiB
+    competitions_usage = models.DecimalField(
+        max_digits=14, decimal_places=2, null=True, blank=True
+    )
+    users_usage = models.DecimalField(
+        max_digits=14, decimal_places=2, null=True, blank=True
+    )
+    admin_usage = models.DecimalField(
+        max_digits=14, decimal_places=2, null=True, blank=True
+    )
+    orphaned_file_usage = models.DecimalField(
+        max_digits=14, decimal_places=2, null=True, blank=True
+    )
+    at_date = models.DateTimeField()
+    created_at = models.DateTimeField(auto_now_add=True)
+
+
+class CompetitionStorageDataPoint(models.Model):
+    competition = models.ForeignKey(
+        "competitions.competition", null=True, on_delete=models.SET_NULL
+    )
+    datasets_total = models.DecimalField(
+        max_digits=14, decimal_places=2, null=True, blank=True
+    )
+    at_date = models.DateTimeField()
+    created_at = models.DateTimeField(auto_now_add=True)
+
+
+class UserStorageDataPoint(models.Model):
+    user = models.ForeignKey(settings.AUTH_USER_MODEL, null=True, on_delete=models.SET_NULL)
+    datasets_total = models.DecimalField(
+        max_digits=14, decimal_places=2, null=True, blank=True
+    )
+    submissions_total = models.DecimalField(
+        max_digits=14, decimal_places=2, null=True, blank=True
+    )
+    at_date = models.DateTimeField()
+    created_at = models.DateTimeField(auto_now_add=True)
+
+
+class AdminStorageDataPoint(models.Model):
+    backups_total = models.DecimalField(
+        max_digits=14, decimal_places=2, null=True, blank=True
+    )
+    at_date = models.DateTimeField()
+    created_at = models.DateTimeField(auto_now_add=True)
diff --git a/src/apps/analytics/tasks.py b/src/apps/analytics/tasks.py
new file mode 100644
index 000000000..7fd2b5bed
--- /dev/null
+++ b/src/apps/analytics/tasks.py
@@ -0,0 +1,608 @@
+import os
+import time
+import logging
+from celery_config import app
+from datetime import datetime, timezone, timedelta
+from django.db.models import (
+    Sum,
+    Q,
+    F,
+    Case,
+    Value,
+    When,
+    DecimalField,
+)
+from django.db.models.functions import TruncDay
+from decimal import Decimal
+
+from competitions.models import Submission, SubmissionDetails
+from datasets.models import Data
+from utils.storage import BundleStorage
+from analytics.models import (
+    StorageUsageHistory,
+    CompetitionStorageDataPoint,
+    UserStorageDataPoint,
+    AdminStorageDataPoint,
+)
+from competitions.models import Competition
+from datasets.models import Data
+from profiles.models import User
+from competitions.models import Submission, SubmissionDetails
+
+from utils.data import pretty_bytes
+
+logger = logging.getLogger()
+
+
+@app.task(queue="site-worker", soft_time_limit=60 * 60 * 12)  # 12 hours
+def create_storage_analytics_snapshot():
+    """
+    Measures the storage usage and saves one data point per day and per category.
+
+    Steps:
+      1. Computes the size (in KiB) of every file whose stored size is unset or negative
+      2. Saves the daily usage of every competition, of every user and of the backups
+      3. Writes the database <-> storage inconsistencies to a log file
+      4. Saves the daily storage usage history of the bucket
+    """
+    # Timer started !
+    logger.info("Task create_storage_analytics_snapshot started")
+    starting_time = time.monotonic()
+
+    # Measure all files with unset size
+    # (-1 flags a size that could not be read, so it is selected again on the next run)
+    for dataset in Data.objects.filter(Q(file_size__isnull=True) | Q(file_size__lt=0)):
+        try:
+            dataset.file_size = Decimal(
+                dataset.data_file.size / 1024
+            )  # file_size is in KiB
+        except Exception:
+            dataset.file_size = Decimal(-1)
+        finally:
+            dataset.save()
+
+    for submission in Submission.objects.filter(
+        Q(prediction_result_file_size__isnull=True)
+        | Q(prediction_result_file_size__lt=0)
+    ):
+        try:
+            submission.prediction_result_file_size = Decimal(
+                submission.prediction_result.size / 1024
+            )  # prediction_result_file_size is in KiB
+        except Exception:
+            submission.prediction_result_file_size = Decimal(-1)
+        finally:
+            submission.save()
+
+    for submission in Submission.objects.filter(
+        Q(scoring_result_file_size__isnull=True) | Q(scoring_result_file_size__lt=0)
+    ):
+        try:
+            submission.scoring_result_file_size = Decimal(
+                submission.scoring_result.size / 1024
+            )  # scoring_result_file_size is in KiB
+        except Exception:
+            submission.scoring_result_file_size = Decimal(-1)
+        finally:
+            submission.save()
+
+    for submission in Submission.objects.filter(
+        Q(detailed_result_file_size__isnull=True) | Q(detailed_result_file_size__lt=0)
+    ):
+        try:
+            submission.detailed_result_file_size = Decimal(
+                submission.detailed_result.size / 1024
+            )  # detailed_result_file_size is in KiB
+        except Exception:
+            submission.detailed_result_file_size = Decimal(-1)
+        finally:
+            submission.save()
+
+    for submissiondetails in SubmissionDetails.objects.filter(
+        Q(file_size__isnull=True) | Q(file_size__lt=0)
+    ):
+        try:
+            submissiondetails.file_size = Decimal(
+                submissiondetails.data_file.size / 1024
+            )  # file_size is in KiB
+        except Exception:
+            submissiondetails.file_size = Decimal(-1)
+        finally:
+            submissiondetails.save()
+
+    # Evaluate the storage usage per category (competition, user or admin) and per day
+    current_datetime = datetime.now(timezone.utc)
+    max_history_days = 365  # days
+
+    # Competitions
+    competitions_datasets = (
+        Data.objects.filter(competition_id__isnull=False)
+        .annotate(day=TruncDay("created_when"))
+        .values("day", "competition_id")
+        .annotate(
+            size=Sum(
+                Case(
+                    When(file_size__gt=0, then=F("file_size")),
+                    default=Value(0),
+                    output_field=DecimalField(),
+                )
+            )
+        )
+    )
+
+    last_competition_storage_datapoint = CompetitionStorageDataPoint.objects.order_by(
+        "-at_date"
+    ).first()
+    last_competition_storage_datapoint_date = (
+        last_competition_storage_datapoint.at_date
+        if last_competition_storage_datapoint
+        else current_datetime - timedelta(days=max_history_days)
+    ).replace(hour=0, minute=0, second=0, microsecond=0)
+    competition_storage_days_count = int(
+        (current_datetime - last_competition_storage_datapoint_date).days
+    )
+    competition_storage_day_range = [
+        last_competition_storage_datapoint_date + timedelta(day)
+        for day in range(1, competition_storage_days_count + 1)
+    ]
+
+    for date in competition_storage_day_range:
+        for competition in Competition.objects.order_by("id"):
+            datasets_usage = competitions_datasets.filter(
+                Q(competition_id=competition.id) & Q(day__lt=date)
+            ).aggregate(total=Sum("size"))["total"]
+            defaults = {
+                "datasets_total": datasets_usage or 0,
+            }
+            lookup_params = {"competition_id": competition.id, "at_date": date}
+            CompetitionStorageDataPoint.objects.update_or_create(
+                defaults=defaults, **lookup_params
+            )
+
+    # Users
+    users_datasets = (
+        Data.objects.filter(created_by_id__isnull=False)
+        .annotate(day=TruncDay("created_when"))
+        .values("day", "created_by_id")
+        .annotate(
+            size=Sum(
+                Case(
+                    When(file_size__gt=0, then=F("file_size")),
+                    default=Value(0),
+                    output_field=DecimalField(),
+                )
+            )
+        )
+    )
+
+    users_submissions = (
+        Submission.objects.filter(owner_id__isnull=False)
+        .annotate(day=TruncDay("created_when"))
+        .values("day", "owner_id")
+        .annotate(
+            size=Sum(
+                Case(
+                    When(
+                        prediction_result_file_size__gt=0,
+                        then=F("prediction_result_file_size"),
+                    ),
+                    default=Value(0),
+                    output_field=DecimalField(),
+                )
+                + Case(
+                    When(
+                        scoring_result_file_size__gt=0,
+                        then=F("scoring_result_file_size"),
+                    ),
+                    default=Value(0),
+                    output_field=DecimalField(),
+                )
+                + Case(
+                    When(
+                        detailed_result_file_size__gt=0,
+                        then=F("detailed_result_file_size"),
+                    ),
+                    default=Value(0),
+                    output_field=DecimalField(),
+                )
+            )
+        )
+    )
+
+    users_submissions_details = (
+        SubmissionDetails.objects.filter(submission__owner_id__isnull=False)
+        .annotate(day=TruncDay("submission__created_when"))
+        .values("day", "submission__owner_id")
+        .annotate(
+            size=Sum(
+                Case(
+                    When(file_size__gt=0, then=F("file_size")),
+                    default=Value(0),
+                    output_field=DecimalField(),
+                )
+            )
+        )
+    )
+
+    last_user_storage_datapoint = UserStorageDataPoint.objects.order_by(
+        "-at_date"
+    ).first()
+    last_user_storage_datapoint_date = (
+        last_user_storage_datapoint.at_date
+        if last_user_storage_datapoint
+        else current_datetime - timedelta(days=max_history_days)
+    ).replace(hour=0, minute=0, second=0, microsecond=0)
+    user_storage_days_count = int(
+        (current_datetime - last_user_storage_datapoint_date).days
+    )
+    user_storage_day_range = [
+        last_user_storage_datapoint_date + timedelta(day)
+        for day in range(1, user_storage_days_count + 1)
+    ]
+
+    for date in user_storage_day_range:
+        for user in User.objects.order_by("id"):
+            datasets_usage = users_datasets.filter(
+                Q(created_by_id=user.id) & Q(day__lt=date)
+            ).aggregate(total=Sum("size"))["total"]
+            submissions_usage = users_submissions.filter(
+                Q(owner_id=user.id) & Q(day__lt=date)
+            ).aggregate(total=Sum("size"))["total"]
+            submissiondetails_usage = users_submissions_details.filter(
+                Q(submission__owner_id=user.id) & Q(day__lt=date)
+            ).aggregate(total=Sum("size"))["total"]
+            defaults = {
+                "datasets_total": datasets_usage or 0,
+                "submissions_total": (submissions_usage or 0)
+                + (submissiondetails_usage or 0),
+            }
+            lookup_params = {"user_id": user.id, "at_date": date}
+            UserStorageDataPoint.objects.update_or_create(
+                defaults=defaults, **lookup_params
+            )
+
+    # Admin
+    last_admin_storage_datapoint = AdminStorageDataPoint.objects.order_by(
+        "-at_date"
+    ).first()
+    last_admin_storage_datapoint_date = (
+        last_admin_storage_datapoint.at_date
+        if last_admin_storage_datapoint
+        else current_datetime - timedelta(days=max_history_days)
+    ).replace(hour=0, minute=0, second=0, microsecond=0)
+    admin_storage_days_count = int(
+        (current_datetime - last_admin_storage_datapoint_date).days
+    )
+    admin_storage_day_range = [
+        last_admin_storage_datapoint_date + timedelta(day)
+        for day in range(1, admin_storage_days_count + 1)
+    ]
+    admin_storage_at_date = {
+        last_admin_storage_datapoint_date + timedelta(day): 0
+        for day in range(1, admin_storage_days_count + 1)
+    }
+
+    backup_objects = BundleStorage.bucket.objects.filter(Prefix="backups")
+    for backup_object in backup_objects:
+        size = backup_object.size
+        last_modified = backup_object.last_modified
+        for date in admin_storage_day_range:
+            if last_modified < date:
+                admin_storage_at_date[date] += size
+
+    for date in admin_storage_day_range:
+        defaults = {"backups_total": admin_storage_at_date[date] / 1024.0}
+        lookup_params = {"at_date": date}
+        AdminStorageDataPoint.objects.update_or_create(
+            defaults=defaults, **lookup_params
+        )
+
+    # Check for database <-> storage inconsistency
+    inconsistencies = {"database": [], "storage": []}
+
+    # Prepare some data
+    last_storage_usage_history_point = (
+        StorageUsageHistory.objects.filter(bucket_name=BundleStorage.bucket.name)
+        .order_by("-at_date")
+        .first()
+    )
+    last_storage_usage_history_date = (
+        last_storage_usage_history_point.at_date
+        if last_storage_usage_history_point
+        else current_datetime - timedelta(days=max_history_days)
+    ).replace(hour=0, minute=0, second=0, microsecond=0)
+    storage_usage_history_days_count = int(
+        (current_datetime - last_storage_usage_history_date).days
+    )
+    storage_usage_history_days = range(1, storage_usage_history_days_count + 1)
+    storage_usage_history_day_range = [
+        last_storage_usage_history_date + timedelta(day)
+        for day in range(1, storage_usage_history_days_count + 1)
+    ]
+
+    # Database
+    nb_missing_files = 0
+
+    # Datasets
+    for dataset in Data.objects.all().order_by("id"):
+        if (
+            not dataset.data_file
+            or not dataset.data_file.name
+            or not BundleStorage.exists(dataset.data_file.name)
+        ):
+            inconsistencies["database"].append(
+                {"model": "dataset", "field": "data_file", "id": dataset.id}
+            )
+            nb_missing_files += 1
+
+    # Submissions
+    for submission in Submission.objects.all().order_by("id"):
+        if (
+            not submission.prediction_result
+            or not submission.prediction_result.name
+            or not BundleStorage.exists(submission.prediction_result.name)
+        ):
+            inconsistencies["database"].append(
+                {
+                    "model": "submission",
+                    "field": "prediction_result",
+                    "id": submission.id,
+                }
+            )
+            nb_missing_files += 1
+        if (
+            not submission.scoring_result
+            or not submission.scoring_result.name
+            or not BundleStorage.exists(submission.scoring_result.name)
+        ):
+            inconsistencies["database"].append(
+                {"model": "submission", "field": "scoring_result", "id": submission.id}
+            )
+            nb_missing_files += 1
+        if (
+            submission.detailed_result
+            and submission.detailed_result.name
+            and not BundleStorage.exists(submission.detailed_result.name)
+        ):
+            inconsistencies["database"].append(
+                {"model": "submission", "field": "detailed_result", "id": submission.id}
+            )
+            nb_missing_files += 1
+
+    # Submission details
+    for submissiondetails in SubmissionDetails.objects.all().order_by("id"):
+        if (
+            not submissiondetails.data_file
+            or not submissiondetails.data_file.name
+            or not BundleStorage.exists(submissiondetails.data_file.name)
+        ):
+            inconsistencies["database"].append(
+                {
+                    "model": "submissiondetails",
+                    "field": "data_file",
+                    "id": submissiondetails.id,
+                }
+            )
+            nb_missing_files += 1
+
+    # Storage
+    nb_orphaned_files = 0
+    orphaned_files_total_size = 0  # In bytes
+    orphaned_files_size_per_date = {
+        last_storage_usage_history_date + timedelta(day): 0
+        for day in range(1, storage_usage_history_days_count + 1)
+    }
+
+    # Dataset
+    # (database paths are loaded once into a set so each storage key is checked in O(1))
+    db_dataset_paths = set(Data.objects.values_list("data_file", flat=True).distinct())
+    storage_dataset_paths = [
+        obj.key for obj in BundleStorage.bucket.objects.filter(Prefix="dataset")
+    ]
+    orphaned_dataset_files = [
+        x for x in storage_dataset_paths if x not in db_dataset_paths
+    ]
+    nb_orphaned_files += len(orphaned_dataset_files)
+    for file in orphaned_dataset_files:
+        size = BundleStorage.size(file)
+        last_modified = BundleStorage.get_modified_time(file)
+        inconsistencies["storage"].append({"path": file, "size": size})
+        orphaned_files_total_size += size
+        for date in storage_usage_history_day_range:
+            if last_modified < date:
+                orphaned_files_size_per_date[date] += size
+
+    # Detailed result
+    db_detailed_result_paths = set(
+        Submission.objects.values_list("detailed_result", flat=True).distinct()
+    )
+    storage_detailed_result_paths = [
+        obj.key for obj in BundleStorage.bucket.objects.filter(Prefix="detailed_result")
+    ]
+    orphaned_detailed_result_files = [
+        x
+        for x in storage_detailed_result_paths
+        if x not in db_detailed_result_paths
+    ]
+    nb_orphaned_files += len(orphaned_detailed_result_files)
+    for file in orphaned_detailed_result_files:
+        size = BundleStorage.size(file)
+        last_modified = BundleStorage.get_modified_time(file)
+        inconsistencies["storage"].append({"path": file, "size": size})
+        orphaned_files_total_size += size
+        for date in storage_usage_history_day_range:
+            if last_modified < date:
+                orphaned_files_size_per_date[date] += size
+
+    # Prediction result
+    db_prediction_result_paths = set(
+        Submission.objects.values_list("prediction_result", flat=True).distinct()
+    )
+    storage_prediction_result_paths = [
+        obj.key
+        for obj in BundleStorage.bucket.objects.filter(Prefix="prediction_result")
+    ]
+    orphaned_prediction_result_files = [
+        x
+        for x in storage_prediction_result_paths
+        if x not in db_prediction_result_paths
+    ]
+    nb_orphaned_files += len(orphaned_prediction_result_files)
+    for file in orphaned_prediction_result_files:
+        size = BundleStorage.size(file)
+        last_modified = BundleStorage.get_modified_time(file)
+        inconsistencies["storage"].append({"path": file, "size": size})
+        orphaned_files_total_size += size
+        for date in storage_usage_history_day_range:
+            if last_modified < date:
+                orphaned_files_size_per_date[date] += size
+
+    # Scoring result
+    db_scoring_result_paths = set(
+        Submission.objects.values_list("scoring_result", flat=True).distinct()
+    )
+    storage_scoring_result_paths = [
+        obj.key for obj in BundleStorage.bucket.objects.filter(Prefix="scoring_result")
+    ]
+    orphaned_scoring_result_files = [
+        x for x in storage_scoring_result_paths if x not in db_scoring_result_paths
+    ]
+    nb_orphaned_files += len(orphaned_scoring_result_files)
+    for file in orphaned_scoring_result_files:
+        size = BundleStorage.size(file)
+        last_modified = BundleStorage.get_modified_time(file)
+        inconsistencies["storage"].append({"path": file, "size": size})
+        orphaned_files_total_size += size
+        for date in storage_usage_history_day_range:
+            if last_modified < date:
+                orphaned_files_size_per_date[date] += size
+
+    # Submission details
+    db_submission_details_paths = set(
+        SubmissionDetails.objects.values_list("data_file", flat=True).distinct()
+    )
+    storage_submission_details_paths = [
+        obj.key
+        for obj in BundleStorage.bucket.objects.filter(Prefix="submission_details")
+    ]
+    orphaned_submission_details_files = [
+        x
+        for x in storage_submission_details_paths
+        if x not in db_submission_details_paths
+    ]
+    nb_orphaned_files += len(orphaned_submission_details_files)
+    for file in orphaned_submission_details_files:
+        size = BundleStorage.size(file)
+        last_modified = BundleStorage.get_modified_time(file)
+        inconsistencies["storage"].append({"path": file, "size": size})
+        orphaned_files_total_size += size
+        for date in storage_usage_history_day_range:
+            if last_modified < date:
+                orphaned_files_size_per_date[date] += size
+
+    # Log the results
+    log_file = (
+        "/app/logs/"
+        + "db_storage_inconsistency_"
+        + current_datetime.strftime("%Y%m%d-%H%M%S")
+        + ".log"
+    )
+    # Create the logs directory if needed so a missing folder cannot abort the task here
+    os.makedirs(os.path.dirname(log_file), exist_ok=True)
+    with open(log_file, "w") as file:
+        file.write("Database <---> Storage Inconsistency\n\n")
+        file.write(f"Bucket: {BundleStorage.bucket.name}\n")
+        file.write(f"Datetime: {current_datetime.isoformat()}\n\n")
+        file.write(f"Missing files: {nb_missing_files} files\n")
+        for missing_file in inconsistencies["database"]:
+            file.write(
+                f'{missing_file["model"]} of id={missing_file["id"]} is missing its {missing_file["field"]}\n'
+            )
+        file.write(
+            f"\nOrphaned files: {nb_orphaned_files} files for a total of {pretty_bytes(orphaned_files_total_size)} ({orphaned_files_total_size}B)\n"
+        )
+        for orphaned_file in inconsistencies["storage"]:
+            file.write(
+                f'{orphaned_file["path"]} {pretty_bytes(orphaned_file["size"])} ({orphaned_file["size"]}B)\n'
+            )
+
+    # Save the storage usage history points
+    for date in [
+        last_storage_usage_history_date + timedelta(day)
+        for day in storage_usage_history_days
+    ]:
+        competitions_usage = (
+            competitions_datasets.filter(day__lt=date).aggregate(total=Sum("size"))[
+                "total"
+            ]
+            or 0
+        )
+        users_usage = (
+            (
+                users_datasets.filter(day__lt=date).aggregate(total=Sum("size"))[
+                    "total"
+                ]
+                or 0
+            )
+            + (
+                users_submissions.filter(day__lt=date).aggregate(total=Sum("size"))[
+                    "total"
+                ]
+                or 0
+            )
+            + (
+                users_submissions_details.filter(day__lt=date).aggregate(
+                    total=Sum("size")
+                )["total"]
+                or 0
+            )
+        )
+        admin_data_point = AdminStorageDataPoint.objects.filter(at_date=date).first()
+        admin_usage = (admin_data_point.backups_total or 0) if admin_data_point else 0
+        orphaned_file_usage = Decimal(orphaned_files_size_per_date[date] / 1024)
+        total_usage = (
+            users_usage + admin_usage + orphaned_file_usage
+        )  # competitions_usage is included inside users_usage
+        storage_usage_history_point = {
+            "bucket_name": BundleStorage.bucket.name,
+            "total_usage": total_usage,
+            "competitions_usage": competitions_usage,
+            "users_usage": users_usage,
+            "admin_usage": admin_usage,
+            "orphaned_file_usage": orphaned_file_usage,
+            "at_date": date,
+        }
+        StorageUsageHistory.objects.create(**storage_usage_history_point)
+
+    # Stop the count!
+    elapsed_time = time.monotonic() - starting_time
+    logger.info(
+        "Task create_storage_analytics_snapshot stoped. Duration = {:.3f} seconds".format(
+            elapsed_time
+        )
+    )
+
+
+@app.task(queue="site-worker")
+def reset_computed_storage_analytics():
+    """
+    Clears every stored file size so the next storage analytics snapshot measures them again.
+    """
+    logger.info("Task reset_computed_storage_analytics started")
+    starting_time = time.monotonic()
+
+    # Reset the value of all computed file sizes so they will be re-computed again without any shifting on the next run of the storage analytics task
+    Submission.objects.all().update(
+        prediction_result_file_size=None,
+        scoring_result_file_size=None,
+        detailed_result_file_size=None,
+    )
+    SubmissionDetails.objects.all().update(file_size=None)
+    Data.objects.all().update(file_size=None)
+
+    elapsed_time = time.monotonic() - starting_time
+    logger.info(
+        "Task reset_computed_storage_analytics stoped.
Duration = {:.3f} seconds".format(
+            elapsed_time
+        )
+    )
diff --git a/src/apps/api/urls.py b/src/apps/api/urls.py
index 0bb521b3e..813a0f8fb 100644
--- a/src/apps/api/urls.py
+++ b/src/apps/api/urls.py
@@ -62,6 +62,11 @@
     path('delete_unused_submissions/', quota.delete_unused_submissions, name="delete_unused_submissions"),
     path('delete_failed_submissions/', quota.delete_failed_submissions, name="delete_failed_submissions"),
 
+    # Analytics
+    path('analytics/storage_usage_history/', analytics.storage_usage_history, name='storage_usage_history'),
+    path('analytics/competitions_usage/', analytics.competitions_usage, name='competitions_usage'),
+    path('analytics/users_usage/', analytics.users_usage, name='users_usage'),
+
     # API Docs
     re_path(r'docs(?P<format>\.json|\.yaml)$', schema_view.without_ui(cache_timeout=0), name='schema-json'),
     path('docs/', schema_view.with_ui('swagger', cache_timeout=0), name='schema-swagger-ui'),
diff --git a/src/apps/api/views/analytics.py b/src/apps/api/views/analytics.py
index 084b042e7..47282c97b 100644
--- a/src/apps/api/views/analytics.py
+++ b/src/apps/api/views/analytics.py
@@ -1,12 +1,16 @@
 from django.db.models import Count, F
 from django.contrib.auth import get_user_model
 from django.http import Http404
+from rest_framework import status
+from rest_framework.exceptions import PermissionDenied, ValidationError
 from rest_framework.views import APIView
 from rest_framework.response import Response
 from rest_framework.renderers import JSONRenderer
 from rest_framework.filters import BaseFilterBackend
+from rest_framework.decorators import api_view
 from rest_framework_csv import renderers as r
 from competitions.models import Competition, Submission
+from analytics.models import StorageUsageHistory, CompetitionStorageDataPoint, UserStorageDataPoint
 from api.serializers.analytics import AnalyticsSerializer
 
 import datetime
@@ -158,3 +162,158 @@ def get(self, request):
             'end_date': end_date,
             'time_unit': time_unit,
         })
+
+
+# Truncation kinds accepted by QuerySet.dates(); any other value makes it fail
+STORAGE_ANALYTICS_RESOLUTIONS = ("year", "month", "week", "day")
+
+
+def _get_resolution(request):
+    """
+    Returns the "resolution" query parameter of the request (defaults to "day").
+
+    Raises ValidationError (HTTP 400) when the value is not supported, instead
+    of letting QuerySet.dates() fail with a server error.
+    """
+    resolution = request.query_params.get("resolution", "day")
+    if resolution not in STORAGE_ANALYTICS_RESOLUTIONS:
+        raise ValidationError({
+            "resolution": "Must be one of: " + ", ".join(STORAGE_ANALYTICS_RESOLUTIONS)
+        })
+    return resolution
+
+
+def _account_label(username, email):
+    """
+    Formats an account as "username (email)".
+
+    Returns None when there is no username, which is the case for data points
+    whose competition or user foreign key has been set to NULL.
+    """
+    if username is None:
+        return None
+    return f"{username} ({email})"
+
+
+@api_view(["GET"])
+def storage_usage_history(request):
+    """
+    Gets the storage usage timeline between the 2 provided dates at the given resolution
+
+    Query parameters:
+      - start_date: "YYYY-MM-DD", defaults to 4 weeks ago
+      - end_date: "YYYY-MM-DD", defaults to today
+      - resolution: "year", "month", "week" or "day" (default)
+    """
+    if not request.user.is_superuser:
+        raise PermissionDenied(detail="Admin only")
+    resolution = _get_resolution(request)
+
+    storage_usage_history = {}
+    last_storage_usage_history_snapshot = StorageUsageHistory.objects.order_by("at_date").last()
+    if last_storage_usage_history_snapshot:
+        start_date = request.query_params.get("start_date", (datetime.datetime.today() - datetime.timedelta(weeks=4)).strftime("%Y-%m-%d"))
+        end_date = request.query_params.get("end_date", datetime.datetime.today().strftime("%Y-%m-%d"))
+
+        query = StorageUsageHistory.objects.filter(
+            bucket_name=last_storage_usage_history_snapshot.bucket_name,
+            at_date__range=(start_date, end_date),
+        ).dates("at_date", resolution).values()
+        for su in query.order_by("-at_date"):
+            storage_usage_history[su['datefield'].isoformat()] = {
+                'total_usage': su['total_usage'],
+                'competitions_usage': su['competitions_usage'],
+                'users_usage': su['users_usage'],
+                'admin_usage': su['admin_usage'],
+                'orphaned_file_usage': su['orphaned_file_usage']
+            }
+
+    return Response(storage_usage_history, status=status.HTTP_200_OK)
+
+
+@api_view(["GET"])
+def competitions_usage(request):
+    """
+    Gets the competitions usage between the 2 provided dates at the given resolution
+
+    Query parameters:
+      - start_date: "YYYY-MM-DD", defaults to 4 weeks ago
+      - end_date: "YYYY-MM-DD", defaults to today
+      - resolution: "year", "month", "week" or "day" (default)
+    """
+    if not request.user.is_superuser:
+        raise PermissionDenied(detail="Admin only")
+    resolution = _get_resolution(request)
+
+    competitions_usage = {}
+    last_competition_storage_snapshot = CompetitionStorageDataPoint.objects.order_by("at_date").last()
+    if last_competition_storage_snapshot:
+        start_date = request.query_params.get("start_date", (datetime.datetime.today() - datetime.timedelta(weeks=4)).strftime("%Y-%m-%d"))
+        end_date = request.query_params.get("end_date", datetime.datetime.today().strftime("%Y-%m-%d"))
+
+        query = CompetitionStorageDataPoint.objects.filter(
+            at_date__range=(start_date, end_date),
+        ).dates("at_date", resolution).values(
+            'id',
+            'competition__id',
+            'competition__title',
+            'competition__created_by__username',
+            'competition__created_by__email',
+            'competition__created_when',
+            'datasets_total',
+            'datefield'
+        )
+        for su in query.order_by("-datefield", "competition__id"):
+            competitions_usage.setdefault(su['datefield'].isoformat(), {})[su['competition__id']] = {
+                'snapshot_id': su['id'],
+                'title': su['competition__title'],
+                'organizer': _account_label(su['competition__created_by__username'], su['competition__created_by__email']),
+                'created_when': su['competition__created_when'],
+                'datasets': su['datasets_total'],
+            }
+
+    return Response(competitions_usage, status=status.HTTP_200_OK)
+
+
+@api_view(["GET"])
+def users_usage(request):
+    """
+    Gets the users usage between the 2 provided dates at the given resolution
+
+    Query parameters:
+      - start_date: "YYYY-MM-DD", defaults to 4 weeks ago
+      - end_date: "YYYY-MM-DD", defaults to today
+      - resolution: "year", "month", "week" or "day" (default)
+    """
+    if not request.user.is_superuser:
+        raise PermissionDenied(detail="Admin only")
+    resolution = _get_resolution(request)
+
+    users_usage = {}
+    last_user_storage_snapshot = UserStorageDataPoint.objects.order_by("at_date").last()
+    if last_user_storage_snapshot:
+        start_date = request.query_params.get("start_date", (datetime.datetime.today() - datetime.timedelta(weeks=4)).strftime("%Y-%m-%d"))
+        end_date = request.query_params.get("end_date", datetime.datetime.today().strftime("%Y-%m-%d"))
+
+        query = UserStorageDataPoint.objects.filter(
+            at_date__range=(start_date, end_date),
+        ).dates("at_date", resolution).values(
+            'id',
+            'user__id',
+            'user__username',
+            'user__email',
+            'user__date_joined',
+            'datasets_total',
+            'submissions_total',
+            'datefield'
+        )
+        for su in query.order_by("-datefield", "user__id"):
+            users_usage.setdefault(su['datefield'].isoformat(), {})[su['user__id']] = {
+                'snapshot_id': su['id'],
+                'name': _account_label(su['user__username'], su['user__email']),
+                'date_joined': su['user__date_joined'],
+                'datasets': su['datasets_total'],
+                'submissions': su['submissions_total'],
+            }
+
+    return Response(users_usage, status=status.HTTP_200_OK)
\ No newline at end of file
diff --git a/src/apps/competitions/migrations/0035_auto_20230914_1319.py b/src/apps/competitions/migrations/0035_auto_20230914_1319.py
new file mode 100644
index 000000000..60e6cd96a
--- /dev/null
+++ b/src/apps/competitions/migrations/0035_auto_20230914_1319.py
@@ -0,0 +1,33 @@
+# Generated by Django 2.2.17 on 2023-09-14 13:19
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('competitions', '0034_auto_20230727_1147'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='submission',
+            name='detailed_result_file_size',
+            field=models.DecimalField(blank=True, decimal_places=2, max_digits=10, null=True),
+        ),
+        migrations.AddField(
+            model_name='submission',
+            name='prediction_result_file_size',
+            field=models.DecimalField(blank=True, decimal_places=2, max_digits=10, null=True),
+        ),
+        migrations.AddField(
+            model_name='submission',
+            name='scoring_result_file_size',
+            field=models.DecimalField(blank=True, decimal_places=2, max_digits=10, null=True),
+        ),
+        migrations.AddField(
+            model_name='submissiondetails',
+            name='file_size',
+            field=models.DecimalField(blank=True, decimal_places=2, max_digits=10, null=True),
+        ),
+    ]
diff --git a/src/apps/competitions/models.py b/src/apps/competitions/models.py
index 65ff5c750..52d92ed5c 100644
--- a/src/apps/competitions/models.py
+++ b/src/apps/competitions/models.py
@@ -403,9 +403,22 @@ class SubmissionDetails(models.Model):
     ]
     name = models.CharField(max_length=50)
     data_file = models.FileField(upload_to=PathWrapper('submission_details'), storage=BundleStorage)
+    file_size = models.DecimalField(max_digits=10, decimal_places=2, null=True, blank=True)  # in KiB
     submission = models.ForeignKey('Submission', on_delete=models.CASCADE, related_name='details')
     is_scoring =
models.BooleanField(default=False)
 
+    def save(self, *args, **kwargs):
+        if self.data_file and (not self.file_size or self.file_size == -1):
+            try:
+                # save file size as KiB
+                # self.data_file.size returns bytes
+                self.file_size = self.data_file.size / 1024
+            except TypeError:
+                # file returns a None size, can't divide None / 1024
+                # -1 indicates an error; the check above retries the measurement on the next save
+                self.file_size = -1
+        return super().save(*args, **kwargs)
+
 
 class Submission(ChaHubSaveMixin, models.Model):
     NONE = "None"
@@ -446,6 +459,10 @@ class Submission(ChaHubSaveMixin, models.Model):
                                          storage=BundleStorage)
     detailed_result = models.FileField(upload_to=PathWrapper('detailed_result'), null=True, blank=True,
                                        storage=BundleStorage)
+
+    prediction_result_file_size = models.DecimalField(max_digits=10, decimal_places=2, null=True, blank=True)  # in KiB
+    scoring_result_file_size = models.DecimalField(max_digits=10, decimal_places=2, null=True, blank=True)  # in KiB
+    detailed_result_file_size = models.DecimalField(max_digits=10, decimal_places=2, null=True, blank=True)  # in KiB
 
     secret = models.UUIDField(default=uuid.uuid4)
     celery_task_id = models.UUIDField(null=True, blank=True)
@@ -499,6 +516,22 @@ def save(self, ignore_submission_limit=False, **kwargs):
         if self.status == Submission.RUNNING and not self.started_when:
             self.started_when = now()
 
+        files_and_sizes_dict = {
+            'prediction_result': 'prediction_result_file_size',
+            'scoring_result': 'scoring_result_file_size',
+            'detailed_result': 'detailed_result_file_size',
+        }
+        for file_path_attr, file_size_attr in files_and_sizes_dict.items():
+            if getattr(self, file_path_attr) and (not getattr(self, file_size_attr) or getattr(self, file_size_attr) == -1):
+                try:
+                    # save file size as KiB
+                    # the .size of the file field named by file_path_attr is in bytes
+                    setattr(self, file_size_attr, getattr(self, file_path_attr).size / 1024)
+                except TypeError:
+                    # file returns a None size, can't divide None / 1024
+                    # -1 indicates an error; the check above retries the measurement on the next save
+                    setattr(self, file_size_attr, -1)
+
         super().save(**kwargs)
 
     def start(self, tasks=None):
diff --git a/src/apps/datasets/models.py b/src/apps/datasets/models.py
index 48d50f3f3..3ca6eb53d 100644
--- a/src/apps/datasets/models.py
+++ b/src/apps/datasets/models.py
@@ -52,7 +52,7 @@ class Data(ChaHubSaveMixin, models.Model):
     key = models.UUIDField(default=uuid.uuid4, blank=True, unique=True)
     is_public = models.BooleanField(default=False)
     upload_completed_successfully = models.BooleanField(default=False)
-    file_size = models.DecimalField(max_digits=10, decimal_places=2, null=True, blank=True)
+    file_size = models.DecimalField(max_digits=10, decimal_places=2, null=True, blank=True)  # in KiB
 
     # This is true if the Data model was created as part of unpacking a competition. Competition bundles themselves
     # are NOT marked True, since they are not created by unpacking!
@@ -65,13 +65,15 @@ def get_download_url(self):
         return reverse('datasets:download', kwargs={'key': self.key})
 
     def save(self, *args, **kwargs):
-        if not self.file_size and self.data_file:
+        if self.data_file and (not self.file_size or self.file_size == -1):
             try:
-                # save file size as kbs
+                # save file size as KiB
+                # self.data_file.size returns bytes
                 self.file_size = self.data_file.size / 1024
             except TypeError:
                 # file returns a None size, can't divide None / 1024
-                self.file_size = 0
+                # -1 indicates an error; the check above retries the measurement on the next save
+                self.file_size = -1
         if not self.name:
             self.name = f"{self.created_by.username} - {self.type}"
         return super().save(*args, **kwargs)
diff --git a/src/settings/base.py b/src/settings/base.py
index 62d3871e8..882fca340 100644
--- a/src/settings/base.py
+++ b/src/settings/base.py
@@ -1,6 +1,7 @@
 import os
 import sys
 from datetime import timedelta
+from celery.schedules import crontab
 
 import dj_database_url
 
@@ -223,6 +224,14 @@
         'task': 'competitions.tasks.submission_status_cleanup',
         'schedule': timedelta(seconds=3600)
     },
+    'create_storage_analytics_snapshot': {
+        'task': 'analytics.tasks.create_storage_analytics_snapshot',
+        'schedule': crontab(hour='2', minute='0',
day_of_week='sun')  # Every Sunday at 02:00 UTC
+    },
+    'reset_computed_storage_analytics': {
+        'task': 'analytics.tasks.reset_computed_storage_analytics',
+        'schedule': crontab(hour='2', minute='0', day_of_month='1', month_of_year="*/3")  # Every 3 months at 02:00 UTC on the 1st
+    },
 }
 CELERY_TIMEZONE = 'UTC'
 CELERY_WORKER_PREFETCH_MULTIPLIER = 1
diff --git a/src/static/js/ours/client.js b/src/static/js/ours/client.js
index 764fb109a..46f373804 100644
--- a/src/static/js/ours/client.js
+++ b/src/static/js/ours/client.js
@@ -308,6 +308,15 @@ CODALAB.api = {
     get_analytics: (filters) => {
         return CODALAB.api.request('GET', `${URLS.API}analytics/`, filters)
     },
+    get_storage_usage_history: (filters) => {
+        return CODALAB.api.request('GET', `${URLS.API}analytics/storage_usage_history/`, filters);
+    },
+    get_competitions_usage: (filters) => {
+        return CODALAB.api.request('GET', `${URLS.API}analytics/competitions_usage/`, filters);
+    },
+    get_users_usage: (filters) => {
+        return CODALAB.api.request('GET', `${URLS.API}analytics/users_usage/`, filters);
+    },
     /*---------------------------------------------------------------------
          User Quota and Cleanup
     ---------------------------------------------------------------------*/
diff --git a/src/static/js/ours/utils.js b/src/static/js/ours/utils.js
index c129b1d8c..5a0edc358 100644
--- a/src/static/js/ours/utils.js
+++ b/src/static/js/ours/utils.js
@@ -89,6 +89,17 @@ function pretty_date(date_string) {
     }
 }
 
+function pretty_bytes(bytes, decimal_places=1, suffix="B") {
+    const units = ['', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi'];
+    for (const unit of units) {
+        if (Math.abs(bytes) < 1024.0) {
+            return bytes.toFixed(decimal_places) + unit + suffix;
+        }
+        bytes /= 1024.0;
+    }
+    return bytes.toFixed(decimal_places) + "Yi" + suffix;
+}
+
 /* ----------------------------------------------------------------------------
  Form data helpers
  ----------------------------------------------------------------------------*/
diff
--git a/src/static/riot/analytics/_competitions_usage.tag b/src/static/riot/analytics/_competitions_usage.tag new file mode 100644 index 000000000..b9568313e --- /dev/null +++ b/src/static/riot/analytics/_competitions_usage.tag @@ -0,0 +1,563 @@ + + + + +
+ +
+
+
+ + +
+
+
+ +
+ + + + + + + + + + + + + + + + + + +
CompetitionOrganizerCreation dateDatasets
{ competitionUsage.title }{ competitionUsage.organizer }{ formatDate(competitionUsage.created_when) }{ formatSize(competitionUsage.datasets) }
+ + + + +
\ No newline at end of file diff --git a/src/static/riot/analytics/_usage_history.tag b/src/static/riot/analytics/_usage_history.tag new file mode 100644 index 000000000..50535d8a6 --- /dev/null +++ b/src/static/riot/analytics/_usage_history.tag @@ -0,0 +1,181 @@ + + + +
+ +
+ + + + +
\ No newline at end of file diff --git a/src/static/riot/analytics/_users_usage.tag b/src/static/riot/analytics/_users_usage.tag new file mode 100644 index 000000000..5aff49e28 --- /dev/null +++ b/src/static/riot/analytics/_users_usage.tag @@ -0,0 +1,644 @@ + + + + +
+ +
+
+
+ + +
+
+
+
+ +
+
+ +
+
+ + + + + + + + + + + + + + + + + + + + +
UserJoined atDatasetsSubmissionsTotal
{ userUsage.name }{ formatDate(userUsage.date_joined) }{ formatSize(userUsage.datasets) }{ formatSize(userUsage.submissions) }{ formatSize(userUsage.datasets + userUsage.submissions) }
+ + + + +
\ No newline at end of file diff --git a/src/static/riot/analytics/analytics.tag b/src/static/riot/analytics/analytics.tag index a1b89f8a3..e6ecbeb26 100644 --- a/src/static/riot/analytics/analytics.tag +++ b/src/static/riot/analytics/analytics.tag @@ -1,14 +1,19 @@

Analytics

+ +

Date Range

-
-