From 8e833083f3c86daf30ada9fbd58b72269c0d9667 Mon Sep 17 00:00:00 2001 From: OhMaley Date: Thu, 10 Aug 2023 11:21:57 -0400 Subject: [PATCH 01/12] add file size rows in 2 DB tables --- .../competitions/migrations/0035_file_size.py | 33 +++++++++++++++++++ src/apps/competitions/models.py | 5 +++ 2 files changed, 38 insertions(+) create mode 100644 src/apps/competitions/migrations/0035_file_size.py diff --git a/src/apps/competitions/migrations/0035_file_size.py b/src/apps/competitions/migrations/0035_file_size.py new file mode 100644 index 000000000..33d7d668a --- /dev/null +++ b/src/apps/competitions/migrations/0035_file_size.py @@ -0,0 +1,33 @@ +# Generated by Django 2.2.17 on 2023-08-10 15:17 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('competitions', '0034_auto_20230727_1147'), + ] + + operations = [ + migrations.AddField( + model_name='submission', + name='detailed_result_file_size', + field=models.DecimalField(blank=True, decimal_places=2, max_digits=10, null=True), + ), + migrations.AddField( + model_name='submission', + name='prediction_result_file_size', + field=models.DecimalField(blank=True, decimal_places=2, max_digits=10, null=True), + ), + migrations.AddField( + model_name='submission', + name='scoring_result_file_size', + field=models.DecimalField(blank=True, decimal_places=2, max_digits=10, null=True), + ), + migrations.AddField( + model_name='submissiondetails', + name='file_size', + field=models.DecimalField(blank=True, decimal_places=2, max_digits=10, null=True), + ), + ] diff --git a/src/apps/competitions/models.py b/src/apps/competitions/models.py index f139ce29b..f117cc61f 100644 --- a/src/apps/competitions/models.py +++ b/src/apps/competitions/models.py @@ -403,6 +403,7 @@ class SubmissionDetails(models.Model): ] name = models.CharField(max_length=50) data_file = models.FileField(upload_to=PathWrapper('submission_details'), storage=BundleStorage) + file_size = models.DecimalField(max_digits=10, decimal_places=2, null=True, blank=True) submission = models.ForeignKey('Submission', on_delete=models.CASCADE, related_name='details') is_scoring = models.BooleanField(default=False) @@ -446,6 +447,10 @@ class Submission(ChaHubSaveMixin, models.Model): storage=BundleStorage) detailed_result = models.FileField(upload_to=PathWrapper('detailed_result'), null=True, blank=True, storage=BundleStorage) + + prediction_result_file_size = models.DecimalField(max_digits=10, decimal_places=2, null=True, blank=True) + scoring_result_file_size = models.DecimalField(max_digits=10, decimal_places=2, null=True, blank=True) + detailed_result_file_size = models.DecimalField(max_digits=10, decimal_places=2, null=True, blank=True) secret = models.UUIDField(default=uuid.uuid4) celery_task_id = models.UUIDField(null=True, blank=True) From 416a9b0ffc1ce03ca22c2ddf001121649b45c282 Mon Sep 17 00:00:00 2001 From: OhMaley Date: Thu, 10 Aug 2023 11:49:14 -0400 Subject: [PATCH 02/12] add celery task definitions for storage analytics --- src/apps/analytics/tasks.py | 35 +++++++++++++++++++++++++++++++++++ src/settings/base.py | 9 +++++++++ 2 files changed, 44 insertions(+) create mode 100644 src/apps/analytics/tasks.py diff --git a/src/apps/analytics/tasks.py b/src/apps/analytics/tasks.py new file mode 100644 index 000000000..6b831e607 --- /dev/null +++ b/src/apps/analytics/tasks.py @@ -0,0 +1,35 @@ +import time +import logging +from celery_config import app + +logger = logging.getLogger() + + +@app.task(queue="site-worker", soft_time_limit=60 * 60 * 12) # 12 hours +def create_storage_analytics_snapshot(): + logger.info("Task create_storage_analytics_snapshot started") + starting_time = time.process_time() + + # TODO Insert valuable code here + + elapsed_time = time.process_time() - starting_time + logger.info( + "Task create_storage_analytics_snapshot stoped. Duration = {:.3f} seconds".format( + elapsed_time + ) + ) + + +@app.task(queue="site-worker") # 12 hours +def reset_computed_storage_analytics(): + logger.info("Task reset_computed_storage_analytics started") + starting_time = time.process_time() + + # TODO Insert valuable code here + + elapsed_time = time.process_time() - starting_time + logger.info( + "Task reset_computed_storage_analytics stoped. Duration = {:.3f} seconds".format( + elapsed_time + ) + ) diff --git a/src/settings/base.py b/src/settings/base.py index 62d3871e8..882fca340 100644 --- a/src/settings/base.py +++ b/src/settings/base.py @@ -1,6 +1,7 @@ import os import sys from datetime import timedelta +from celery.schedules import crontab import dj_database_url @@ -223,6 +224,14 @@ 'task': 'competitions.tasks.submission_status_cleanup', 'schedule': timedelta(seconds=3600) }, + 'create_storage_analytics_snapshot': { + 'task': 'analytics.tasks.create_storage_analytics_snapshot', + 'schedule': crontab(hour='2', minute='0', day_of_week='sun') # Every Sunday at 02:00 UTC time + }, + 'reset_computed_storage_analytics': { + 'task': 'analytics.tasks.reset_computed_storage_analytics', + 'schedule': crontab(hour='2', minute='0', day_of_month='1', month_of_year="*/3") # Every 3 month at 02:00 UTC on the 1st + }, } CELERY_TIMEZONE = 'UTC' CELERY_WORKER_PREFETCH_MULTIPLIER = 1 From a72b5e3917170001436f802d4fbf1caddafbcc55 Mon Sep 17 00:00:00 2001 From: OhMaley Date: Thu, 10 Aug 2023 12:06:27 -0400 Subject: [PATCH 03/12] reset computed storage analytics task done --- src/apps/analytics/tasks.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/apps/analytics/tasks.py b/src/apps/analytics/tasks.py index 6b831e607..aa5836798 100644 --- a/src/apps/analytics/tasks.py +++ b/src/apps/analytics/tasks.py @@ -2,6 +2,9 @@ import logging from celery_config import app +from competitions.models import Submission, SubmissionDetails +from datasets.models import Data + logger = logging.getLogger() @@ -25,7 +28,14 @@ def reset_computed_storage_analytics(): logger.info("Task reset_computed_storage_analytics started") starting_time = time.process_time() - # TODO Insert valuable code here + # Reset the value of all computed file sizes so they will be re-computed again without any shifting on the next run of the storage analytics task + Submission.objects.all().update( + prediction_result_file_size=None, + scoring_result_file_size=None, + detailed_result_file_size=None, + ) + SubmissionDetails.objects.all().update(file_size=None) + Data.objects.all().update(file_size=None) elapsed_time = time.process_time() - starting_time logger.info( From b7993e41ae2be3687f2cc484234e312e66da7448 Mon Sep 17 00:00:00 2001 From: OhMaley Date: Thu, 10 Aug 2023 15:38:20 -0400 Subject: [PATCH 04/12] handle file size error --- src/apps/datasets/models.py | 8 +++++--- src/static/riot/datasets/management.tag | 4 ++++ src/static/riot/submissions/resource_submissions.tag | 4 ++++ 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/src/apps/datasets/models.py b/src/apps/datasets/models.py index 48d50f3f3..78cd1014c 100644 --- a/src/apps/datasets/models.py +++ b/src/apps/datasets/models.py @@ -65,13 +65,15 @@ def get_download_url(self): return reverse('datasets:download', kwargs={'key': self.key}) def save(self, *args, **kwargs): - if not self.file_size and self.data_file: + if self.data_file and (not self.file_size or self.file_size == -1): try: - # save file size as kbs + # save file size as KiB + # self.data_file.size returns bytes self.file_size = self.data_file.size / 1024 except TypeError: # file returns a None size, can't divide None / 1024 - self.file_size = 0 + # -1 indicates an error + self.file_size = -1 if not self.name: self.name = f"{self.created_by.username} - {self.type}" return super().save(*args, **kwargs) diff --git a/src/static/riot/datasets/management.tag b/src/static/riot/datasets/management.tag index 7b782472c..5d2ba198e 100644 --- a/src/static/riot/datasets/management.tag +++ b/src/static/riot/datasets/management.tag @@ -459,6 +459,10 @@ // return empty string if parsing fails return "" } + // a file_size of -1 indicated an error + if(n < 0) { + return "" + } // constant units to show with files size // file size is in KB, converting it to MB and GB const units = ['KB', 'MB', 'GB'] diff --git a/src/static/riot/submissions/resource_submissions.tag b/src/static/riot/submissions/resource_submissions.tag index bf6b31d01..b4c2fd1bb 100644 --- a/src/static/riot/submissions/resource_submissions.tag +++ b/src/static/riot/submissions/resource_submissions.tag @@ -430,6 +430,10 @@ // return empty string if parsing fails return "" } + // a file_size of -1 indicated an error + if(n < 0) { + return "" + } // constant units to show with files size // file size is in KB, converting it to MB and GB const units = ['KB', 'MB', 'GB'] From 6d5a4cb9ee2b8829e013151e9ebb6bd6e3856727 Mon Sep 17 00:00:00 2001 From: OhMaley Date: Thu, 17 Aug 2023 17:15:09 -0400 Subject: [PATCH 05/12] create storage analytics snapshot task done --- src/apps/analytics/migrations/0001_initial.py | 61 +++++ src/apps/analytics/migrations/__init__.py | 0 src/apps/analytics/models.py | 37 +++ src/apps/analytics/tasks.py | 248 +++++++++++++++++- src/apps/competitions/models.py | 36 ++- src/apps/datasets/models.py | 2 +- 6 files changed, 378 insertions(+), 6 deletions(-) create mode 100644 src/apps/analytics/migrations/0001_initial.py create mode 100644 src/apps/analytics/migrations/__init__.py create mode 100644 src/apps/analytics/models.py diff --git a/src/apps/analytics/migrations/0001_initial.py b/src/apps/analytics/migrations/0001_initial.py new file mode 100644 index 000000000..604ebee8f --- /dev/null +++ b/src/apps/analytics/migrations/0001_initial.py @@ -0,0 +1,61 @@ +# Generated by Django 2.2.17 on 2023-08-17 19:43 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + initial = True + + dependencies = [ + ] + + operations = [ + migrations.CreateModel( + name='AdminStorageDataPoint', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('backups_total', models.DecimalField(blank=True, decimal_places=2, max_digits=14, null=True)), + ('others_total', models.DecimalField(blank=True, decimal_places=2, max_digits=14, null=True)), + ('at_date', models.DateTimeField()), + ], + ), + migrations.CreateModel( + name='CompetitionStorageDataPoint', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('competition_id', models.PositiveIntegerField()), + ('title', models.CharField(max_length=255)), + ('created_by', models.CharField(max_length=255)), + ('created_when', models.DateTimeField()), + ('competition_type', models.CharField(choices=[('competition', 'competition'), ('benchmark', 'benchmark')], default='competition', max_length=128)), + ('datasets_total', models.DecimalField(blank=True, decimal_places=2, max_digits=14, null=True)), + ('at_date', models.DateTimeField()), + ], + ), + migrations.CreateModel( + name='StorageUsageHistory', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('bucket_name', models.CharField(max_length=255)), + ('at_date', models.DateTimeField()), + ('total_usage', models.DecimalField(blank=True, decimal_places=2, max_digits=14, null=True)), + ('competitions_usage', models.DecimalField(blank=True, decimal_places=2, max_digits=14, null=True)), + ('users_usage', models.DecimalField(blank=True, decimal_places=2, max_digits=14, null=True)), + ('admin_usage', models.DecimalField(blank=True, decimal_places=2, max_digits=14, null=True)), + ('created_at', models.DateTimeField(auto_now_add=True)), + ], + ), + migrations.CreateModel( + name='UserStorageDataPoint', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('user_id', models.PositiveIntegerField()), + ('email', models.CharField(max_length=255)), + ('username', models.CharField(max_length=255)), + ('datasets_total', models.DecimalField(blank=True, decimal_places=2, max_digits=14, null=True)), + ('submissions_total', models.DecimalField(blank=True, decimal_places=2, max_digits=14, null=True)), + ('at_date', models.DateTimeField()), + ], + ), + ] diff --git a/src/apps/analytics/migrations/__init__.py b/src/apps/analytics/migrations/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/apps/analytics/models.py b/src/apps/analytics/models.py new file mode 100644 index 000000000..9188ca425 --- /dev/null +++ b/src/apps/analytics/models.py @@ -0,0 +1,37 @@ +from django.db import models +from competitions.models import Competition + + +class StorageUsageHistory(models.Model): + bucket_name = models.CharField(max_length=255) + at_date = models.DateTimeField() + total_usage = models.DecimalField(max_digits=14, decimal_places=2, null=True, blank=True) # in KiB up to ~ 930 TiB + competitions_usage = models.DecimalField(max_digits=14, decimal_places=2, null=True, blank=True) + users_usage = models.DecimalField(max_digits=14, decimal_places=2, null=True, blank=True) + admin_usage = models.DecimalField(max_digits=14, decimal_places=2, null=True, blank=True) + created_at = models.DateTimeField(auto_now_add=True) + + +class CompetitionStorageDataPoint(models.Model): + competition_id = models.PositiveIntegerField() + title = models.CharField(max_length=255) + created_by = models.CharField(max_length=255) + created_when = models.DateTimeField() + competition_type = models.CharField(max_length=128, choices=Competition.COMPETITION_TYPE, default=Competition.COMPETITION) + datasets_total = models.DecimalField(max_digits=14, decimal_places=2, null=True, blank=True) + at_date = models.DateTimeField() + + +class UserStorageDataPoint(models.Model): + user_id = models.PositiveIntegerField() + email = models.CharField(max_length=255) + username = models.CharField(max_length=255) + datasets_total = models.DecimalField(max_digits=14, decimal_places=2, null=True, blank=True) + submissions_total = models.DecimalField(max_digits=14, decimal_places=2, null=True, blank=True) + at_date = models.DateTimeField() + + +class AdminStorageDataPoint(models.Model): + backups_total = models.DecimalField(max_digits=14, decimal_places=2, null=True, blank=True) + others_total = models.DecimalField(max_digits=14, decimal_places=2, null=True, blank=True) + at_date = models.DateTimeField() diff --git a/src/apps/analytics/tasks.py b/src/apps/analytics/tasks.py index aa5836798..e754798cb 100644 --- a/src/apps/analytics/tasks.py +++ b/src/apps/analytics/tasks.py @@ -1,9 +1,23 @@ import time import logging from celery_config import app +from datetime import datetime, timezone, timedelta +from django.db.models import Sum, Q, F +from decimal import Decimal from competitions.models import Submission, SubmissionDetails from datasets.models import Data +from utils.storage import BundleStorage +from analytics.models import ( + StorageUsageHistory, + CompetitionStorageDataPoint, + UserStorageDataPoint, + AdminStorageDataPoint, +) +from competitions.models import Competition +from datasets.models import Data +from profiles.models import User +from competitions.models import Submission, SubmissionDetails logger = logging.getLogger() @@ -13,7 +27,239 @@ def create_storage_analytics_snapshot(): logger.info("Task create_storage_analytics_snapshot started") starting_time = time.process_time() - # TODO Insert valuable code here + # Measure all files with unset size + for dataset in Data.objects.filter(Q(file_size__isnull=True) | Q(file_size__lt=0)): + try: + dataset.file_size = Decimal(dataset.data_file.size / 1024) + except: + dataset.file_size = Decimal(-1) + finally: + dataset.save() + + for submission in Submission.objects.filter( + Q(prediction_result_file_size__isnull=True) + | Q(prediction_result_file_size__lt=0) + ): + try: + submission.prediction_result_file_size = Decimal( + submission.prediction_result.size / 1024 + ) + except: + submission.prediction_result_file_size = Decimal(-1) + finally: + submission.save() + + for submission in Submission.objects.filter( + Q(scoring_result_file_size__isnull=True) | Q(scoring_result_file_size__lt=0) + ): + try: + submission.scoring_result_file_size = Decimal( + submission.scoring_result.size / 1024 + ) + except: + submission.scoring_result_file_size = Decimal(-1) + finally: + submission.save() + + for submission in Submission.objects.filter( + Q(detailed_result_file_size__isnull=True) | Q(detailed_result_file_size__lt=0) + ): + try: + submission.detailed_result_file_size = Decimal( + submission.detailed_result.size / 1024 + ) + except: + submission.detailed_result_file_size = Decimal(-1) + finally: + submission.save() + + for submissiondetails in SubmissionDetails.objects.filter( + Q(file_size__isnull=True) | Q(file_size__lt=0) + ): + try: + submissiondetails.file_size = Decimal( + submissiondetails.data_file.size / 1024 + ) + except: + submissiondetails.file_size = Decimal(-1) + finally: + submissiondetails.save() + + # Retrieve the last storage usage history point + bucket = BundleStorage.bucket # type: ignore + current_datetime = datetime.now(timezone.utc) + + # Competitions details + competitions = Competition.objects.all().reverse() + for competition in competitions: + datasets_usage = Data.objects.filter( + ( + Q(competition__id=competition.id) + & Q(type=Data.SUBMISSION) + & Q(was_created_by_competition=True) + ) + | ( + Q(competition__id=competition.id) + & Q(type=Data.SOLUTION) + & Q(was_created_by_competition=True) + ) + | ( + Q(competition__id=competition.id) + & ~Q(type=Data.SUBMISSION) + & ~Q(type=Data.SOLUTION) + ) + ).aggregate(total=Sum("file_size"))["total"] + + defaults = { + "title": competition.title, + "created_by": competition.created_by, + "created_when": competition.created_when, + "competition_type": competition.competition_type, + "datasets_total": datasets_usage or 0, + } + lookup_params = {"competition_id": competition.id, "at_date": current_datetime} + CompetitionStorageDataPoint.objects.update_or_create( + defaults=defaults, **lookup_params + ) + + # User details + users = User.objects.exclude(id=-1).exclude(username="AnonymousUser") + for user in users: + datasets_usage = Data.objects.filter( + Q(created_by__id=user.id) + & ( + ( + Q(type=Data.SUBMISSION) + & ( + Q(competition__isnull=True) + | Q(was_created_by_competition=False) + ) + ) + | ( + Q(type=Data.SOLUTION) + & ( + Q(competition__isnull=True) + | Q(was_created_by_competition=False) + ) + ) + | ( + ~Q(type=Data.SUBMISSION) + & ~Q(type=Data.SOLUTION) + & Q(competition__isnull=True) + ) + ) + ).aggregate(total=Sum("file_size"))["total"] + submissions_usage = Submission.objects.filter(owner__id=user.id).aggregate( + total=Sum( + F("prediction_result_file_size") + + F("scoring_result_file_size") + + F("detailed_result_file_size") + ) + )["total"] + submissiondetails_usage = SubmissionDetails.objects.filter( + submission__owner=user.id + ).aggregate(total=Sum("file_size"))["total"] + + defaults = { + "email": user.email, + "username": user.username, + "datasets_total": datasets_usage or 0, + "submissions_total": (submissions_usage or 0) + + (submissiondetails_usage or 0), + } + lookup_params = {"user_id": user.id, "at_date": current_datetime} + UserStorageDataPoint.objects.update_or_create( + defaults=defaults, **lookup_params + ) + + # Admin details + datasets_usage = Data.objects.filter( + ( + Q(type=Data.SUBMISSION) + & (Q(competition__isnull=True) | Q(was_created_by_competition=False)) + & Q(created_by__isnull=True) + ) + | ( + Q(type=Data.SOLUTION) + & (Q(competition__isnull=True) | Q(was_created_by_competition=False)) + & Q(created_by__isnull=True) + ) + | ( + ~Q(type=Data.SUBMISSION) + & ~Q(type=Data.SOLUTION) + & Q(competition__isnull=True) + & Q(created_by__isnull=True) + ) + ).aggregate(total=Sum("file_size"))["total"] + backups_usage = 0 + objects = bucket.objects.filter(Prefix="backups") + for object in objects: + backups_usage += object.size + + defaults = {"backups_total": backups_usage, "others_total": datasets_usage or 0} + lookup_params = {"at_date": current_datetime} + AdminStorageDataPoint.objects.update_or_create(defaults=defaults, **lookup_params) + + # Save the storage usage history points + last_storage_usage_history_point = ( + StorageUsageHistory.objects.filter(bucket_name=bucket.name) + .order_by("-at_date") + .first() + ) + last_storage_usage_history_date = ( + last_storage_usage_history_point.at_date + if last_storage_usage_history_point + else current_datetime - timedelta(days=1000) + ).replace(hour=0, minute=0, second=0, microsecond=0) + days_count = int((current_datetime - last_storage_usage_history_date).days) + days = range(1, days_count + 1) + usage_at_date = { + last_storage_usage_history_date + + timedelta(day): { + "total": Decimal(0), + "competitions": Decimal(0), + "users": Decimal(0), + "admin": Decimal(0), + } + for day in days + } + + for competition_data_point in CompetitionStorageDataPoint.objects.all().order_by( + "at_date" + ): + for date in usage_at_date: + if competition_data_point.at_date <= date: + if competition_data_point.datasets_total: + usage_at_date[date][ + "competitions" + ] += competition_data_point.datasets_total + + for user_data_point in UserStorageDataPoint.objects.all().order_by("at_date"): + for date in usage_at_date: + if user_data_point.at_date <= date: + if user_data_point.datasets_total: + usage_at_date[date]["users"] += user_data_point.datasets_total + if user_data_point.submissions_total: + usage_at_date[date]["users"] += user_data_point.submissions_total + + for admin_data_point in AdminStorageDataPoint.objects.all().order_by("at_date"): + for date in usage_at_date: + if admin_data_point.at_date <= date: + if admin_data_point.backups_total: + usage_at_date[date]["admin"] += admin_data_point.backups_total + if admin_data_point.others_total: + usage_at_date[date]["admin"] += admin_data_point.others_total + + for date, usages in usage_at_date.items(): + storage_usage_history_point = { + "bucket_name": bucket.name, + "at_date": date, + "total_usage": usages["total"], + "competitions_usage": usages["competitions"], + "users_usage": usages["users"], + "admin_usage": usages["admin"], + } + StorageUsageHistory.objects.create(**storage_usage_history_point) elapsed_time = time.process_time() - starting_time logger.info( diff --git a/src/apps/competitions/models.py b/src/apps/competitions/models.py index f117cc61f..a2f1cb112 100644 --- a/src/apps/competitions/models.py +++ b/src/apps/competitions/models.py @@ -403,10 +403,22 @@ class SubmissionDetails(models.Model): ] name = models.CharField(max_length=50) data_file = models.FileField(upload_to=PathWrapper('submission_details'), storage=BundleStorage) - file_size = models.DecimalField(max_digits=10, decimal_places=2, null=True, blank=True) + file_size = models.DecimalField(max_digits=10, decimal_places=2, null=True, blank=True) # in KiB submission = models.ForeignKey('Submission', on_delete=models.CASCADE, related_name='details') is_scoring = models.BooleanField(default=False) + def save(self, *args, **kwargs): + if self.data_file and (not self.file_size or self.file_size == -1): + try: + # save file size as KiB + # self.data_file.size returns bytes + self.file_size = self.data_file.size / 1024 + except TypeError: + # file returns a None size, can't divide None / 1024 + # -1 indicates an error + self.file_size = -1 + return super().save(*args, **kwargs) + class Submission(ChaHubSaveMixin, models.Model): NONE = "None" @@ -448,9 +460,9 @@ class Submission(ChaHubSaveMixin, models.Model): detailed_result = models.FileField(upload_to=PathWrapper('detailed_result'), null=True, blank=True, storage=BundleStorage) - prediction_result_file_size = models.DecimalField(max_digits=10, decimal_places=2, null=True, blank=True) - scoring_result_file_size = models.DecimalField(max_digits=10, decimal_places=2, null=True, blank=True) - detailed_result_file_size = models.DecimalField(max_digits=10, decimal_places=2, null=True, blank=True) + prediction_result_file_size = models.DecimalField(max_digits=10, decimal_places=2, null=True, blank=True) # in KiB + scoring_result_file_size = models.DecimalField(max_digits=10, decimal_places=2, null=True, blank=True) # in KiB + detailed_result_file_size = models.DecimalField(max_digits=10, decimal_places=2, null=True, blank=True) # in KiB secret = models.UUIDField(default=uuid.uuid4) celery_task_id = models.UUIDField(null=True, blank=True) @@ -504,6 +516,22 @@ def save(self, ignore_submission_limit=False, **kwargs): if self.status == Submission.RUNNING and not self.started_when: self.started_when = now() + files_and_sizes_dict = { + 'prediction_result': 'prediction_result_file_size', + 'scoring_result': 'scoring_result_file_size', + 'detailed_result': 'detailed_result_file_size', + } + for file_path_attr, file_size_attr in files_and_sizes_dict.items(): + if getattr(self, file_path_attr) and (not getattr(self, file_size_attr) or getattr(self, file_size_attr) == -1): + try: + # save file size as KiB + # self.data_file.size returns bytes + setattr(self, file_size_attr, getattr(self, file_path_attr).size / 1024) + except TypeError: + # file returns a None size, can't divide None / 1024 + # -1 indicates an error + setattr(self, file_size_attr, -1) + super().save(**kwargs) def start(self, tasks=None): diff --git a/src/apps/datasets/models.py b/src/apps/datasets/models.py index 78cd1014c..3ca6eb53d 100644 --- a/src/apps/datasets/models.py +++ b/src/apps/datasets/models.py @@ -52,7 +52,7 @@ class Data(ChaHubSaveMixin, models.Model): key = models.UUIDField(default=uuid.uuid4, blank=True, unique=True) is_public = models.BooleanField(default=False) upload_completed_successfully = models.BooleanField(default=False) - file_size = models.DecimalField(max_digits=10, decimal_places=2, null=True, blank=True) + file_size = models.DecimalField(max_digits=10, decimal_places=2, null=True, blank=True) # in KiB # This is true if the Data model was created as part of unpacking a competition. Competition bundles themselves # are NOT marked True, since they are not created by unpacking! From 63d71a727bfa6d5fa810b9721b68b3272a51cde4 Mon Sep 17 00:00:00 2001 From: OhMaley Date: Fri, 18 Aug 2023 16:08:20 -0400 Subject: [PATCH 06/12] analyse and log database-storage inconsistency --- .gitignore | 1 + docker-compose.yml | 1 + src/apps/analytics/tasks.py | 102 ++++++++++++++++++++++++++++++++++++ 3 files changed, 104 insertions(+) diff --git a/.gitignore b/.gitignore index b5b34f56a..4ac22b2f4 100644 --- a/.gitignore +++ b/.gitignore @@ -13,6 +13,7 @@ var/ var_*/ certs/ backups/ +logs/ src/static/output.css src/static/output.js diff --git a/docker-compose.yml b/docker-compose.yml index d45c6d013..4d89fe0bb 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -30,6 +30,7 @@ services: - .:/app:delegated - /tmp/codalab-v2/django:/codalab_tmp - ./backups:/app/backups + - ./var/logs:/app/logs restart: unless-stopped ports: - 8000:8000 diff --git a/src/apps/analytics/tasks.py b/src/apps/analytics/tasks.py index e754798cb..0abbb8119 100644 --- a/src/apps/analytics/tasks.py +++ b/src/apps/analytics/tasks.py @@ -24,6 +24,7 @@ @app.task(queue="site-worker", soft_time_limit=60 * 60 * 12) # 12 hours def create_storage_analytics_snapshot(): + # Timing started ! logger.info("Task create_storage_analytics_snapshot started") starting_time = time.process_time() @@ -261,6 +262,107 @@ def create_storage_analytics_snapshot(): } StorageUsageHistory.objects.create(**storage_usage_history_point) + # Check for database <-> storage inconsistency + inconsistencies = {"database": [], "storage": []} + + # Database + for dataset in Data.objects.all().order_by("id"): + if ( + not dataset.data_file + or not dataset.data_file.name + or not BundleStorage.exists(dataset.data_file.name) + ): + inconsistencies["database"].append( + {"model": "dataset", "field": "data_file", "id": dataset.id} + ) + for submission in Submission.objects.all().order_by("id"): + if ( + not submission.prediction_result + or not submission.prediction_result.name + or not BundleStorage.exists(submission.prediction_result.name) + ): + inconsistencies["database"].append( + { + "model": "submission", + "field": "prediction_result", + "id": submission.id, + } + ) + if ( + not submission.scoring_result + or not submission.scoring_result.name + or not BundleStorage.exists(submission.scoring_result.name) + ): + inconsistencies["database"].append( + {"model": "submission", "field": "scoring_result", "id": submission.id} + ) + if ( + not submission.detailed_result + or not submission.detailed_result.name + or not BundleStorage.exists(submission.detailed_result.name) + ): + inconsistencies["database"].append( + {"model": "submission", "field": "detailed_result", "id": submission.id} + ) + for submissiondetails in SubmissionDetails.objects.all().order_by("id"): + if ( + not submissiondetails.data_file + or not submissiondetails.data_file.name + or not BundleStorage.exists(submissiondetails.data_file.name) + ): + inconsistencies["database"].append( + { + "model": "submissiondetails", + "field": "data_file", + "id": submissiondetails.id, + } + ) + + # Storage + # Dataset + db_dataset_paths = Data.objects.values_list('data_file', flat=True) + storage_dataset_paths = [obj.key for obj in bucket.objects.filter(Prefix='dataset')] + orphaned_dataset_files = set(storage_dataset_paths) - set(db_dataset_paths) + inconsistencies["storage"] += list(orphaned_dataset_files) + + # Detailed result + db_detailed_result_paths = Submission.objects.values_list('detailed_result', flat=True) + storage_detailed_result_paths = [obj.key for obj in bucket.objects.filter(Prefix='detailed_result')] + orphaned_detailed_result_files = set(storage_detailed_result_paths) - set(db_detailed_result_paths) + inconsistencies["storage"] += list(orphaned_detailed_result_files) + + # Prediction result + db_prediction_result_paths = Submission.objects.values_list('prediction_result', flat=True) + storage_prediction_result_paths = [obj.key for obj in bucket.objects.filter(Prefix='prediction_result')] + orphaned_prediction_result_files = set(storage_prediction_result_paths) - set(db_prediction_result_paths) + inconsistencies["storage"] += list(orphaned_prediction_result_files) + + # Scoring result + db_scoring_result_paths = Submission.objects.values_list('scoring_result', flat=True) + storage_scoring_result_paths = [obj.key for obj in bucket.objects.filter(Prefix='scoring_result')] + orphaned_scoring_result_files = set(storage_scoring_result_paths) - set(db_scoring_result_paths) + inconsistencies["storage"] += list(orphaned_scoring_result_files) + + # Submission details + db_submission_details_paths = SubmissionDetails.objects.values_list('data_file', flat=True) + storage_submission_details_paths = [obj.key for obj in bucket.objects.filter(Prefix='submission_details')] + orphaned_submission_details_files = set(storage_submission_details_paths) - set(db_submission_details_paths) + inconsistencies["storage"] += list(orphaned_submission_details_files) + + # Log the results + log_file = "/app/logs/" + "db_storage_inconsistency_" + current_datetime.strftime("%Y%m%d-%H%M%S") + ".log" + with open(log_file, 'w') as file: + file.write('Database <---> Storage Inconsistency\n\n') + file.write(f'Bucket: {bucket.name}\n') + file.write(f'Datetime: {current_datetime.isoformat()}\n\n') + file.write(f'Missing files:\n') + for missing_file in inconsistencies["database"]: + file.write(f'{missing_file["model"]} of id={missing_file["id"]} is missing its {missing_file["field"]}\n') + file.write(f'\nOrphaned files:\n') + for orphaned_file in inconsistencies["storage"]: + file.write(f'{orphaned_file}\n') + + # Stop the count! elapsed_time = time.process_time() - starting_time logger.info( "Task create_storage_analytics_snapshot stoped. Duration = {:.3f} seconds".format( From d9a3be25d960d24edbd4785892335f74121a7780 Mon Sep 17 00:00:00 2001 From: OhMaley Date: Fri, 22 Sep 2023 16:40:45 -0400 Subject: [PATCH 07/12] work on the front-end --- src/apps/analytics/migrations/0001_initial.py | 39 +- src/apps/analytics/models.py | 52 +- src/apps/analytics/tasks.py | 586 ++++++++++++------ src/apps/api/urls.py | 4 + src/apps/api/views/analytics.py | 73 +++ ...ile_size.py => 0035_auto_20230914_1319.py} | 2 +- src/settings/base.py | 6 +- src/static/js/ours/client.js | 6 + src/static/js/ours/utils.js | 11 + src/static/riot/analytics/analytics.tag | 559 ++++++++++++++--- src/utils/data.py | 8 + 11 files changed, 1023 insertions(+), 323 deletions(-) rename src/apps/competitions/migrations/{0035_file_size.py => 0035_auto_20230914_1319.py} (95%) diff --git a/src/apps/analytics/migrations/0001_initial.py b/src/apps/analytics/migrations/0001_initial.py index 604ebee8f..b5aefc6c0 100644 --- a/src/apps/analytics/migrations/0001_initial.py +++ b/src/apps/analytics/migrations/0001_initial.py @@ -1,6 +1,8 @@ -# Generated by Django 2.2.17 on 2023-08-17 19:43 +# Generated by Django 2.2.17 on 2023-09-14 13:19 +from django.conf import settings from django.db import migrations, models +import django.db.models.deletion class Migration(migrations.Migration): @@ -8,6 +10,8 @@ class Migration(migrations.Migration): initial = True dependencies = [ + ('competitions', '0035_auto_20230914_1319'), + migrations.swappable_dependency(settings.AUTH_USER_MODEL), ] operations = [ @@ -16,21 +20,8 @@ class Migration(migrations.Migration): fields=[ ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), ('backups_total', models.DecimalField(blank=True, decimal_places=2, max_digits=14, null=True)), - ('others_total', models.DecimalField(blank=True, decimal_places=2, max_digits=14, null=True)), - ('at_date', models.DateTimeField()), - ], - ), - migrations.CreateModel( - name='CompetitionStorageDataPoint', - fields=[ - ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), - ('competition_id', models.PositiveIntegerField()), - ('title', models.CharField(max_length=255)), - ('created_by', models.CharField(max_length=255)), - ('created_when', models.DateTimeField()), - ('competition_type', models.CharField(choices=[('competition', 'competition'), ('benchmark', 'benchmark')], default='competition', max_length=128)), - ('datasets_total', models.DecimalField(blank=True, decimal_places=2, max_digits=14, null=True)), ('at_date', models.DateTimeField()), + ('created_at', models.DateTimeField(auto_now_add=True)), ], ), migrations.CreateModel( @@ -38,11 +29,12 @@ class Migration(migrations.Migration): fields=[ ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), ('bucket_name', models.CharField(max_length=255)), - ('at_date', models.DateTimeField()), ('total_usage', models.DecimalField(blank=True, decimal_places=2, max_digits=14, null=True)), ('competitions_usage', models.DecimalField(blank=True, decimal_places=2, max_digits=14, null=True)), ('users_usage', models.DecimalField(blank=True, decimal_places=2, max_digits=14, null=True)), ('admin_usage', models.DecimalField(blank=True, decimal_places=2, max_digits=14, null=True)), + ('orphaned_file_usage', models.DecimalField(blank=True, decimal_places=2, max_digits=14, null=True)), + ('at_date', models.DateTimeField()), ('created_at', models.DateTimeField(auto_now_add=True)), ], ), @@ -50,12 +42,21 @@ class Migration(migrations.Migration): name='UserStorageDataPoint', fields=[ ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), - ('user_id', models.PositiveIntegerField()), - ('email', models.CharField(max_length=255)), - ('username', models.CharField(max_length=255)), ('datasets_total', models.DecimalField(blank=True, decimal_places=2, max_digits=14, null=True)), ('submissions_total', models.DecimalField(blank=True, decimal_places=2, max_digits=14, null=True)), ('at_date', models.DateTimeField()), + ('created_at', models.DateTimeField(auto_now_add=True)), + ('user', models.ForeignKey(null=True, on_delete=django.db.models.deletion.SET_NULL, to=settings.AUTH_USER_MODEL)), + ], + ), + migrations.CreateModel( + name='CompetitionStorageDataPoint', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('datasets_total', models.DecimalField(blank=True, decimal_places=2, max_digits=14, null=True)), + ('at_date', models.DateTimeField()), + ('created_at', models.DateTimeField(auto_now_add=True)), + ('competition', models.ForeignKey(null=True, on_delete=django.db.models.deletion.SET_NULL, to='competitions.Competition')), ], ), ] diff --git a/src/apps/analytics/models.py b/src/apps/analytics/models.py index 9188ca425..9c1ade208 100644 --- a/src/apps/analytics/models.py +++ b/src/apps/analytics/models.py @@ -1,37 +1,55 @@ from django.db import models +from django.conf import settings from competitions.models import Competition class StorageUsageHistory(models.Model): bucket_name = models.CharField(max_length=255) + total_usage = models.DecimalField( + max_digits=14, decimal_places=2, null=True, blank=True + ) # in KiB up to ~ 930 TiB + competitions_usage = models.DecimalField( + max_digits=14, decimal_places=2, null=True, blank=True + ) + users_usage = models.DecimalField( + max_digits=14, decimal_places=2, null=True, blank=True + ) + admin_usage = models.DecimalField( + max_digits=14, decimal_places=2, null=True, blank=True + ) + orphaned_file_usage = models.DecimalField( + max_digits=14, decimal_places=2, null=True, blank=True + ) at_date = models.DateTimeField() - total_usage = models.DecimalField(max_digits=14, decimal_places=2, null=True, blank=True) # in KiB up to ~ 930 TiB - competitions_usage = models.DecimalField(max_digits=14, decimal_places=2, null=True, blank=True) - users_usage = models.DecimalField(max_digits=14, decimal_places=2, null=True, blank=True) - admin_usage = models.DecimalField(max_digits=14, decimal_places=2, null=True, blank=True) created_at = models.DateTimeField(auto_now_add=True) class CompetitionStorageDataPoint(models.Model): - competition_id = models.PositiveIntegerField() - title = models.CharField(max_length=255) - created_by = models.CharField(max_length=255) - created_when = models.DateTimeField() - competition_type = models.CharField(max_length=128, choices=Competition.COMPETITION_TYPE, default=Competition.COMPETITION) - datasets_total = models.DecimalField(max_digits=14, decimal_places=2, null=True, blank=True) + competition = models.ForeignKey( + "competitions.competition", null=True, on_delete=models.SET_NULL + ) + datasets_total = models.DecimalField( + max_digits=14, decimal_places=2, null=True, blank=True + ) at_date = models.DateTimeField() + created_at = models.DateTimeField(auto_now_add=True) class UserStorageDataPoint(models.Model): - user_id = models.PositiveIntegerField() - email = models.CharField(max_length=255) - username = models.CharField(max_length=255) - datasets_total = models.DecimalField(max_digits=14, decimal_places=2, null=True, blank=True) - submissions_total = models.DecimalField(max_digits=14, decimal_places=2, null=True, blank=True) + user = models.ForeignKey(settings.AUTH_USER_MODEL, null=True, on_delete=models.SET_NULL) + datasets_total = models.DecimalField( + max_digits=14, decimal_places=2, null=True, blank=True + ) + submissions_total = models.DecimalField( + max_digits=14, decimal_places=2, null=True, blank=True + ) at_date = models.DateTimeField() + created_at = models.DateTimeField(auto_now_add=True) class AdminStorageDataPoint(models.Model): - backups_total = models.DecimalField(max_digits=14, decimal_places=2, null=True, blank=True) - others_total = models.DecimalField(max_digits=14, decimal_places=2, null=True, blank=True) + backups_total = models.DecimalField( + max_digits=14, decimal_places=2, null=True, blank=True + ) at_date = models.DateTimeField() + created_at = models.DateTimeField(auto_now_add=True) diff --git a/src/apps/analytics/tasks.py b/src/apps/analytics/tasks.py index 0abbb8119..5183dea22 100644 --- a/src/apps/analytics/tasks.py +++ b/src/apps/analytics/tasks.py @@ -2,7 +2,16 @@ import logging from celery_config import app from datetime import datetime, timezone, timedelta -from django.db.models import Sum, Q, F +from django.db.models import ( + Sum, + Q, + F, + Case, + Value, + When, + DecimalField, +) +from django.db.models.functions import TruncDay from decimal import Decimal from competitions.models import Submission, SubmissionDetails @@ -19,19 +28,23 @@ from profiles.models import User from competitions.models import Submission, SubmissionDetails +from utils.data import pretty_bytes + logger = logging.getLogger() @app.task(queue="site-worker", soft_time_limit=60 * 60 * 12) # 12 hours def create_storage_analytics_snapshot(): - # Timing started ! + # Timer started ! logger.info("Task create_storage_analytics_snapshot started") starting_time = time.process_time() # Measure all files with unset size for dataset in Data.objects.filter(Q(file_size__isnull=True) | Q(file_size__lt=0)): try: - dataset.file_size = Decimal(dataset.data_file.size / 1024) + dataset.file_size = Decimal( + dataset.data_file.size / 1024 + ) # file_size is in KiB except: dataset.file_size = Decimal(-1) finally: @@ -44,7 +57,7 @@ def create_storage_analytics_snapshot(): try: submission.prediction_result_file_size = Decimal( submission.prediction_result.size / 1024 - ) + ) # prediction_result_file_size is in KiB except: submission.prediction_result_file_size = Decimal(-1) finally: @@ -56,7 +69,7 @@ def create_storage_analytics_snapshot(): try: submission.scoring_result_file_size = Decimal( submission.scoring_result.size / 1024 - ) + ) # scoring_result_file_size is in KiB except: submission.scoring_result_file_size = Decimal(-1) finally: @@ -68,7 +81,7 @@ def create_storage_analytics_snapshot(): try: submission.detailed_result_file_size = Decimal( submission.detailed_result.size / 1024 - ) + ) # detailed_result_file_size is in KiB except: submission.detailed_result_file_size = Decimal(-1) finally: @@ -80,192 +93,226 @@ def create_storage_analytics_snapshot(): try: submissiondetails.file_size = Decimal( submissiondetails.data_file.size / 1024 - ) + ) # file_size is in KiB except: submissiondetails.file_size = Decimal(-1) finally: submissiondetails.save() - # Retrieve the last storage usage history point - bucket = BundleStorage.bucket # type: ignore + # Evaluate the storage usage per category (competition, user or admin) and per day current_datetime = datetime.now(timezone.utc) - - # Competitions details - competitions = Competition.objects.all().reverse() - for competition in competitions: - datasets_usage = Data.objects.filter( - ( - Q(competition__id=competition.id) - & Q(type=Data.SUBMISSION) - & Q(was_created_by_competition=True) + max_history_days = 365 # days + + # Competitions + competitions_datasets = ( + Data.objects.filter(competition_id__isnull=False) + .annotate(day=TruncDay("created_when")) + .values("day", "competition_id") + .annotate( + size=Sum( + Case( + When(file_size__gt=0, then=F("file_size")), + default=Value(0), + output_field=DecimalField(), + ) ) - | ( - Q(competition__id=competition.id) - & Q(type=Data.SOLUTION) - & Q(was_created_by_competition=True) + ) + ) + + last_competition_storage_datapoint = CompetitionStorageDataPoint.objects.order_by( + "-at_date" + ).first() + last_competition_storage_datapoint_date = ( + last_competition_storage_datapoint.at_date + if last_competition_storage_datapoint + else current_datetime - timedelta(days=max_history_days) + ).replace(hour=0, minute=0, second=0, microsecond=0) + competition_storage_days_count = int( + (current_datetime - last_competition_storage_datapoint_date).days + ) + competition_storage_day_range = [ + last_competition_storage_datapoint_date + timedelta(day) + for day in range(1, competition_storage_days_count + 1) + ] + + for date in competition_storage_day_range: + for competition in Competition.objects.order_by("id"): + datasets_usage = competitions_datasets.filter( + Q(competition_id=competition.id) & Q(day__lt=date) + ).aggregate(total=Sum("size"))["total"] + defaults = { + "datasets_total": datasets_usage or 0, + } + lookup_params = {"competition_id": competition.id, "at_date": date} + CompetitionStorageDataPoint.objects.update_or_create( + defaults=defaults, **lookup_params ) - | ( - Q(competition__id=competition.id) - & ~Q(type=Data.SUBMISSION) - & ~Q(type=Data.SOLUTION) + + # Users + users_datasets = ( + Data.objects.filter(created_by_id__isnull=False) + .annotate(day=TruncDay("created_when")) + .values("day", "created_by_id") + .annotate( + size=Sum( + Case( + When(file_size__gt=0, then=F("file_size")), + default=Value(0), + output_field=DecimalField(), + ) ) - ).aggregate(total=Sum("file_size"))["total"] - - defaults = { - "title": competition.title, - "created_by": competition.created_by, - "created_when": competition.created_when, - "competition_type": competition.competition_type, - "datasets_total": datasets_usage or 0, - } - lookup_params = {"competition_id": competition.id, "at_date": current_datetime} - CompetitionStorageDataPoint.objects.update_or_create( - defaults=defaults, **lookup_params ) + ) - # User details - users = User.objects.exclude(id=-1).exclude(username="AnonymousUser") - for user in users: - datasets_usage = Data.objects.filter( - Q(created_by__id=user.id) - & ( - ( - Q(type=Data.SUBMISSION) - & ( - Q(competition__isnull=True) - | Q(was_created_by_competition=False) - ) + users_submissions = ( + Submission.objects.filter(owner_id__isnull=False) + .annotate(day=TruncDay("created_when")) + .values("day", "owner_id") + .annotate( + size=Sum( + Case( + When( + prediction_result_file_size__gt=0, + then=F("prediction_result_file_size"), + ), + default=Value(0), + output_field=DecimalField(), ) - | ( - Q(type=Data.SOLUTION) - & ( - Q(competition__isnull=True) - | Q(was_created_by_competition=False) - ) + + Case( + When( + scoring_result_file_size__gt=0, + then=F("scoring_result_file_size"), + ), + default=Value(0), + output_field=DecimalField(), ) - | ( - ~Q(type=Data.SUBMISSION) - & ~Q(type=Data.SOLUTION) - & Q(competition__isnull=True) + + Case( + When( + detailed_result_file_size__gt=0, + then=F("detailed_result_file_size"), + ), + default=Value(0), + output_field=DecimalField(), ) ) - ).aggregate(total=Sum("file_size"))["total"] - submissions_usage = Submission.objects.filter(owner__id=user.id).aggregate( - total=Sum( - F("prediction_result_file_size") - + F("scoring_result_file_size") - + F("detailed_result_file_size") - ) - )["total"] - submissiondetails_usage = SubmissionDetails.objects.filter( - submission__owner=user.id - ).aggregate(total=Sum("file_size"))["total"] - - defaults = { - "email": user.email, - "username": user.username, - "datasets_total": datasets_usage or 0, - "submissions_total": (submissions_usage or 0) - + (submissiondetails_usage or 0), - } - lookup_params = {"user_id": user.id, "at_date": current_datetime} - UserStorageDataPoint.objects.update_or_create( - defaults=defaults, **lookup_params ) + ) - # Admin details - datasets_usage = Data.objects.filter( - ( - Q(type=Data.SUBMISSION) - & (Q(competition__isnull=True) | Q(was_created_by_competition=False)) - & Q(created_by__isnull=True) - ) - | ( - Q(type=Data.SOLUTION) - & (Q(competition__isnull=True) | Q(was_created_by_competition=False)) - & Q(created_by__isnull=True) - ) - | ( - ~Q(type=Data.SUBMISSION) - & ~Q(type=Data.SOLUTION) - & Q(competition__isnull=True) - & Q(created_by__isnull=True) + users_submissions_details = ( + SubmissionDetails.objects.filter(submission__owner_id__isnull=False) + .annotate(day=TruncDay("submission__created_when")) + .values("day", "submission__owner_id") + .annotate( + size=Sum( + Case( + When(file_size__gt=0, then=F("file_size")), + default=Value(0), + output_field=DecimalField(), + ) + ) ) - ).aggregate(total=Sum("file_size"))["total"] - backups_usage = 0 - objects = bucket.objects.filter(Prefix="backups") + ) + + last_user_storage_datapoint = UserStorageDataPoint.objects.order_by( + "-at_date" + ).first() + last_user_storage_datapoint_date = ( + last_user_storage_datapoint.at_date + if last_user_storage_datapoint + else current_datetime - timedelta(days=max_history_days) + ).replace(hour=0, minute=0, second=0, microsecond=0) + user_storage_days_count = int( + (current_datetime - last_user_storage_datapoint_date).days + ) + user_storage_day_range = [ + last_user_storage_datapoint_date + timedelta(day) + for day in range(1, user_storage_days_count + 1) + ] + + for date in user_storage_day_range: + for user in User.objects.order_by("id"): + datasets_usage = users_datasets.filter( + Q(created_by_id=user.id) & Q(day__lt=date) + ).aggregate(total=Sum("size"))["total"] + submissions_usage = users_submissions.filter( + Q(owner_id=user.id) & Q(day__lt=date) + ).aggregate(total=Sum("size"))["total"] + submissiondetails_usage = users_submissions_details.filter( + Q(submission__owner_id=user.id) & Q(day__lt=date) + ).aggregate(total=Sum("size"))["total"] + defaults = { + "datasets_total": datasets_usage or 0, + "submissions_total": (submissions_usage or 0) + + (submissiondetails_usage or 0), + } + lookup_params = {"user_id": user.id, "at_date": date} + UserStorageDataPoint.objects.update_or_create( + defaults=defaults, **lookup_params + ) + + # Admin + last_admin_storage_datapoint = AdminStorageDataPoint.objects.order_by( + "-at_date" + ).first() + last_admin_storage_datapoint_date = ( + last_admin_storage_datapoint.at_date + if last_admin_storage_datapoint + else current_datetime - timedelta(days=max_history_days) + ).replace(hour=0, minute=0, second=0, microsecond=0) + admin_storage_days_count = int( + (current_datetime - last_admin_storage_datapoint_date).days + ) + admin_storage_day_range = [ + last_admin_storage_datapoint_date + timedelta(day) + for day in range(1, admin_storage_days_count + 1) + ] + admin_storage_at_date = { + last_admin_storage_datapoint_date + timedelta(day): 0 + for day in range(1, admin_storage_days_count + 1) + } + + objects = BundleStorage.bucket.objects.filter(Prefix="backups") for object in objects: - backups_usage += object.size + size = object.size + last_modified = object.last_modified + for date in admin_storage_day_range: + if last_modified < date: + admin_storage_at_date[date] += size + + for date in admin_storage_day_range: + defaults = {"backups_total": admin_storage_at_date[date]} + lookup_params = {"at_date": date} + AdminStorageDataPoint.objects.update_or_create( + defaults=defaults, **lookup_params + ) - defaults = {"backups_total": backups_usage, "others_total": datasets_usage or 0} - lookup_params = {"at_date": current_datetime} - AdminStorageDataPoint.objects.update_or_create(defaults=defaults, **lookup_params) + # Check for database <-> storage inconsistency + inconsistencies = {"database": [], "storage": []} - # Save the storage usage history points + # Prepare some data last_storage_usage_history_point = ( - StorageUsageHistory.objects.filter(bucket_name=bucket.name) + StorageUsageHistory.objects.filter(bucket_name=BundleStorage.bucket.name) .order_by("-at_date") .first() ) last_storage_usage_history_date = ( last_storage_usage_history_point.at_date if last_storage_usage_history_point - else current_datetime - timedelta(days=1000) + else current_datetime - timedelta(days=max_history_days) ).replace(hour=0, minute=0, second=0, microsecond=0) - days_count = int((current_datetime - last_storage_usage_history_date).days) - days = range(1, days_count + 1) - usage_at_date = { - last_storage_usage_history_date - + timedelta(day): { - "total": Decimal(0), - "competitions": Decimal(0), - "users": Decimal(0), - "admin": Decimal(0), - } - for day in days - } - - for competition_data_point in CompetitionStorageDataPoint.objects.all().order_by( - "at_date" - ): - for date in usage_at_date: - if competition_data_point.at_date <= date: - if competition_data_point.datasets_total: - usage_at_date[date][ - "competitions" - ] += competition_data_point.datasets_total - - for user_data_point in UserStorageDataPoint.objects.all().order_by("at_date"): - for date in usage_at_date: - if user_data_point.at_date <= date: - if user_data_point.datasets_total: - usage_at_date[date]["users"] += user_data_point.datasets_total - if user_data_point.submissions_total: - usage_at_date[date]["users"] += user_data_point.submissions_total - - for admin_data_point in AdminStorageDataPoint.objects.all().order_by("at_date"): - for date in usage_at_date: - if admin_data_point.at_date <= date: - if admin_data_point.backups_total: - usage_at_date[date]["admin"] += admin_data_point.backups_total - if admin_data_point.others_total: - usage_at_date[date]["admin"] += admin_data_point.others_total - - for date, usages in usage_at_date.items(): - storage_usage_history_point = { - "bucket_name": bucket.name, - "at_date": date, - "total_usage": usages["total"], - "competitions_usage": usages["competitions"], - "users_usage": usages["users"], - "admin_usage": usages["admin"], - } - StorageUsageHistory.objects.create(**storage_usage_history_point) - - # Check for database <-> storage inconsistency - inconsistencies = {"database": [], "storage": []} + storage_usage_history_days_count = int( + (current_datetime - last_storage_usage_history_date).days + ) + storage_usage_history_days = range(1, storage_usage_history_days_count + 1) + storage_usage_history_day_range = [ + last_storage_usage_history_date + timedelta(day) + for day in range(1, storage_usage_history_days_count + 1) + ] # Database + nb_missing_files = 0 + + # Datasets for dataset in Data.objects.all().order_by("id"): if ( not dataset.data_file @@ -275,6 +322,9 @@ def create_storage_analytics_snapshot(): inconsistencies["database"].append( {"model": "dataset", "field": "data_file", "id": dataset.id} ) + nb_missing_files += 1 + + # Submissions for submission in Submission.objects.all().order_by("id"): if ( not submission.prediction_result @@ -288,6 +338,7 @@ def create_storage_analytics_snapshot(): "id": submission.id, } ) + nb_missing_files += 1 if ( not submission.scoring_result or not submission.scoring_result.name @@ -296,14 +347,18 @@ def create_storage_analytics_snapshot(): inconsistencies["database"].append( {"model": "submission", "field": "scoring_result", "id": submission.id} ) + nb_missing_files += 1 if ( - not submission.detailed_result - or not submission.detailed_result.name - or not BundleStorage.exists(submission.detailed_result.name) + submission.detailed_result + and submission.detailed_result.name + and not BundleStorage.exists(submission.detailed_result.name) ): inconsistencies["database"].append( {"model": "submission", "field": "detailed_result", "id": submission.id} ) + nb_missing_files += 1 + + # Submission details for submissiondetails in SubmissionDetails.objects.all().order_by("id"): if ( not submissiondetails.data_file @@ -317,50 +372,193 @@ def create_storage_analytics_snapshot(): "id": submissiondetails.id, } ) - + nb_missing_files += 1 + # Storage + nb_orphaned_files = 0 # In Bytes + orphaned_files_total_size = 0 # In bytes + orphaned_files_size_per_date = { + last_storage_usage_history_date + timedelta(day): 0 + for day in range(1, storage_usage_history_days_count + 1) + } + # Dataset - db_dataset_paths = Data.objects.values_list('data_file', flat=True) - storage_dataset_paths = [obj.key for obj in bucket.objects.filter(Prefix='dataset')] - orphaned_dataset_files = set(storage_dataset_paths) - set(db_dataset_paths) - inconsistencies["storage"] += list(orphaned_dataset_files) + db_dataset_paths = Data.objects.values_list("data_file", flat=True).distinct() + storage_dataset_paths = [ + obj.key for obj in BundleStorage.bucket.objects.filter(Prefix="dataset") + ] + orphaned_dataset_files = [ + x for x in storage_dataset_paths if x not in set(db_dataset_paths) + ] + nb_orphaned_files += len(orphaned_dataset_files) + for file in orphaned_dataset_files: + size = BundleStorage.size(file) + last_modified = BundleStorage.get_modified_time(file) + inconsistencies["storage"].append({"path": file, "size": size}) + orphaned_files_total_size += size + for date in storage_usage_history_day_range: + if last_modified < date: + orphaned_files_size_per_date[date] += size # Detailed result - db_detailed_result_paths = Submission.objects.values_list('detailed_result', flat=True) - storage_detailed_result_paths = [obj.key for obj in bucket.objects.filter(Prefix='detailed_result')] - orphaned_detailed_result_files = set(storage_detailed_result_paths) - set(db_detailed_result_paths) - inconsistencies["storage"] += list(orphaned_detailed_result_files) + db_detailed_result_paths = Submission.objects.values_list( + "detailed_result", flat=True + ).distinct() + storage_detailed_result_paths = [ + obj.key for obj in BundleStorage.bucket.objects.filter(Prefix="detailed_result") + ] + orphaned_detailed_result_files = [ + x + for x in storage_detailed_result_paths + if x not in set(db_detailed_result_paths) + ] + nb_orphaned_files += len(orphaned_detailed_result_files) + for file in orphaned_detailed_result_files: + size = BundleStorage.size(file) + last_modified = BundleStorage.get_modified_time(file) + inconsistencies["storage"].append({"path": file, "size": size}) + orphaned_files_total_size += size + for date in storage_usage_history_day_range: + if last_modified < date: + orphaned_files_size_per_date[date] += size # Prediction result - db_prediction_result_paths = Submission.objects.values_list('prediction_result', flat=True) - storage_prediction_result_paths = [obj.key for obj in bucket.objects.filter(Prefix='prediction_result')] - orphaned_prediction_result_files = set(storage_prediction_result_paths) - set(db_prediction_result_paths) - inconsistencies["storage"] += list(orphaned_prediction_result_files) + db_prediction_result_paths = Submission.objects.values_list( + "prediction_result", flat=True + ).distinct() + storage_prediction_result_paths = [ + obj.key + for obj in BundleStorage.bucket.objects.filter(Prefix="prediction_result") + ] + orphaned_prediction_result_files = [ + x + for x in storage_prediction_result_paths + if x not in set(db_prediction_result_paths) + ] + nb_orphaned_files += len(orphaned_prediction_result_files) + for file in orphaned_prediction_result_files: + size = BundleStorage.size(file) + last_modified = BundleStorage.get_modified_time(file) + inconsistencies["storage"].append({"path": file, "size": size}) + orphaned_files_total_size += size + for date in storage_usage_history_day_range: + if last_modified < date: + orphaned_files_size_per_date[date] += size # Scoring result - db_scoring_result_paths = Submission.objects.values_list('scoring_result', flat=True) - storage_scoring_result_paths = [obj.key for obj in bucket.objects.filter(Prefix='scoring_result')] - orphaned_scoring_result_files = set(storage_scoring_result_paths) - set(db_scoring_result_paths) - inconsistencies["storage"] += list(orphaned_scoring_result_files) + db_scoring_result_paths = Submission.objects.values_list( + "scoring_result", flat=True + ).distinct() + storage_scoring_result_paths = [ + obj.key for obj in BundleStorage.bucket.objects.filter(Prefix="scoring_result") + ] + orphaned_scoring_result_files = [ + x for x in storage_scoring_result_paths if x not in set(db_scoring_result_paths) + ] + nb_orphaned_files += len(orphaned_scoring_result_files) + for file in orphaned_scoring_result_files: + size = BundleStorage.size(file) + last_modified = BundleStorage.get_modified_time(file) + inconsistencies["storage"].append({"path": file, "size": size}) + orphaned_files_total_size += size + for date in storage_usage_history_day_range: + if last_modified < date: + orphaned_files_size_per_date[date] += size # Submission details - db_submission_details_paths = SubmissionDetails.objects.values_list('data_file', flat=True) - storage_submission_details_paths = [obj.key for obj in bucket.objects.filter(Prefix='submission_details')] - orphaned_submission_details_files = set(storage_submission_details_paths) - set(db_submission_details_paths) - inconsistencies["storage"] += list(orphaned_submission_details_files) + db_submission_details_paths = SubmissionDetails.objects.values_list( + "data_file", flat=True + ).distinct() + storage_submission_details_paths = [ + obj.key + for obj in BundleStorage.bucket.objects.filter(Prefix="submission_details") + ] + orphaned_submission_details_files = [ + x + for x in storage_submission_details_paths + if x not in set(db_submission_details_paths) + ] + nb_orphaned_files += len(orphaned_submission_details_files) + for file in orphaned_submission_details_files: + size = BundleStorage.size(file) + last_modified = BundleStorage.get_modified_time(file) + inconsistencies["storage"].append({"path": file, "size": size}) + orphaned_files_total_size += size + for date in storage_usage_history_day_range: + if last_modified < date: + orphaned_files_size_per_date[date] += size # Log the results - log_file = "/app/logs/" + "db_storage_inconsistency_" + current_datetime.strftime("%Y%m%d-%H%M%S") + ".log" - with open(log_file, 'w') as file: - file.write('Database <---> Storage Inconsistency\n\n') - file.write(f'Bucket: {bucket.name}\n') - file.write(f'Datetime: {current_datetime.isoformat()}\n\n') - file.write(f'Missing files:\n') + log_file = ( + "/app/logs/" + + "db_storage_inconsistency_" + + current_datetime.strftime("%Y%m%d-%H%M%S") + + ".log" + ) + with open(log_file, "w") as file: + file.write("Database <---> Storage Inconsistency\n\n") + file.write(f"Bucket: {BundleStorage.bucket.name}\n") + file.write(f"Datetime: {current_datetime.isoformat()}\n\n") + file.write(f"Missing files: {nb_missing_files} files\n") for missing_file in inconsistencies["database"]: - file.write(f'{missing_file["model"]} of id={missing_file["id"]} is missing its {missing_file["field"]}\n') - file.write(f'\nOrphaned files:\n') + file.write( + f'{missing_file["model"]} of id={missing_file["id"]} is missing its {missing_file["field"]}\n' + ) + file.write( + f"\nOrphaned files: {nb_orphaned_files} files for a total of {pretty_bytes(orphaned_files_total_size)} ({orphaned_files_total_size}B)\n" + ) for orphaned_file in inconsistencies["storage"]: - file.write(f'{orphaned_file}\n') + file.write( + f'{orphaned_file["path"]} {pretty_bytes(orphaned_file["size"])} ({orphaned_file["size"]}B)\n' + ) + + # Save the storage usage history points + for date in [ + last_storage_usage_history_date + timedelta(day) + for day in storage_usage_history_days + ]: + competitions_usage = ( + competitions_datasets.filter(day__lt=date).aggregate(total=Sum("size"))[ + "total" + ] + or 0 + ) + users_usage = ( + ( + users_datasets.filter(day__lt=date).aggregate(total=Sum("size"))[ + "total" + ] + or 0 + ) + + ( + users_submissions.filter(day__lt=date).aggregate(total=Sum("size"))[ + "total" + ] + or 0 + ) + + ( + users_submissions_details.filter(day__lt=date).aggregate( + total=Sum("size") + )["total"] + or 0 + ) + ) + admin_data_point = AdminStorageDataPoint.objects.filter(at_date=date).first() + admin_usage = (admin_data_point.backups_total or 0) if admin_data_point else 0 + orphaned_file_usage = Decimal(orphaned_files_size_per_date[date] / 1024) + total_usage = ( + users_usage + admin_usage + orphaned_file_usage + ) # competitions_usage is included inside users_usage + storage_usage_history_point = { + "bucket_name": BundleStorage.bucket.name, + "total_usage": total_usage, + "competitions_usage": competitions_usage, + "users_usage": users_usage, + "admin_usage": admin_usage, + "orphaned_file_usage": orphaned_file_usage, + "at_date": date, + } + StorageUsageHistory.objects.create(**storage_usage_history_point) # Stop the count! elapsed_time = time.process_time() - starting_time diff --git a/src/apps/api/urls.py b/src/apps/api/urls.py index 0bb521b3e..8702aa6c2 100644 --- a/src/apps/api/urls.py +++ b/src/apps/api/urls.py @@ -62,6 +62,10 @@ path('delete_unused_submissions/', quota.delete_unused_submissions, name="delete_unused_submissions"), path('delete_failed_submissions/', quota.delete_failed_submissions, name="delete_failed_submissions"), + # Analytics + path('analytics/storage_usage_history/', analytics.storage_usage_history, name='storage_usage_history'), + path('analytics/competitions_usage/', analytics.competitions_usage, name='competitions_usage'), + # API Docs re_path(r'docs(?P\.json|\.yaml)$', schema_view.without_ui(cache_timeout=0), name='schema-json'), path('docs/', schema_view.with_ui('swagger', cache_timeout=0), name='schema-swagger-ui'), diff --git a/src/apps/api/views/analytics.py b/src/apps/api/views/analytics.py index 084b042e7..05e54c27a 100644 --- a/src/apps/api/views/analytics.py +++ b/src/apps/api/views/analytics.py @@ -1,12 +1,16 @@ from django.db.models import Count, F from django.contrib.auth import get_user_model from django.http import Http404 +from rest_framework import status +from rest_framework.exceptions import PermissionDenied from rest_framework.views import APIView from rest_framework.response import Response from rest_framework.renderers import JSONRenderer from rest_framework.filters import BaseFilterBackend +from rest_framework.decorators import api_view from rest_framework_csv import renderers as r from competitions.models import Competition, Submission +from analytics.models import StorageUsageHistory, CompetitionStorageDataPoint from api.serializers.analytics import AnalyticsSerializer import datetime @@ -158,3 +162,72 @@ def get(self, request): 'end_date': end_date, 'time_unit': time_unit, }) + +@api_view(["GET"]) +def storage_usage_history(request): + """ + Gets the storage usage timeline between the 2 provided dates at the given resolution + """ + if not request.user.is_superuser: + raise PermissionDenied(detail="Admin only") + + storage_usage_history = {} + last_storage_usage_history_snapshot = StorageUsageHistory.objects.order_by("at_date").last() + if last_storage_usage_history_snapshot: + start_date = request.query_params.get("start_date", (datetime.datetime.today() - datetime.timedelta(weeks=4)).strftime("%Y-%m-%d")) + end_date = request.query_params.get("end_date", datetime.datetime.today().strftime("%Y-%m-%d")) + resolution = request.query_params.get("resolution", "day") + + query = StorageUsageHistory.objects.filter( + bucket_name=last_storage_usage_history_snapshot.bucket_name, + at_date__range=(start_date, end_date), + ).dates("at_date", resolution).values() + for su in query.order_by("-at_date"): + storage_usage_history[su['datefield'].isoformat()] = { + 'total_usage': su['total_usage'], + 'competitions_usage': su['competitions_usage'], + 'users_usage': su['users_usage'], + 'admin_usage': su['admin_usage'], + 'orphaned_file_usage': su['orphaned_file_usage'] + } + + return Response(storage_usage_history, status=status.HTTP_200_OK) + + +@api_view(["GET"]) +def competitions_usage(request): + """ + Gets the competitions usage between the 2 provided dates at the given resolution + """ + if not request.user.is_superuser: + raise PermissionDenied(detail="Admin only") + + competitions_usage = {} + last_competition_storage_snapshot = CompetitionStorageDataPoint.objects.order_by("at_date").last() + if last_competition_storage_snapshot: + start_date = request.query_params.get("start_date", (datetime.datetime.today() - datetime.timedelta(weeks=4)).strftime("%Y-%m-%d")) + end_date = request.query_params.get("end_date", datetime.datetime.today().strftime("%Y-%m-%d")) + resolution = request.query_params.get("resolution", "day") + + query = CompetitionStorageDataPoint.objects.filter( + at_date__range=(start_date, end_date), + ).dates("at_date", resolution).values( + 'id', + 'competition__id', + 'competition__title', + 'competition__created_by__username', + 'competition__created_by__email', + 'competition__created_when', + 'datasets_total', + 'datefield' + ) + for su in query.order_by("-datefield", "competition__id"): + competitions_usage.setdefault(su['datefield'].isoformat(), {})[su['competition__id']] = { + 'snapshot_id': su['id'], + 'title': su['competition__title'], + 'organizer': su['competition__created_by__username'] + " (" + su['competition__created_by__email'] + ")", + 'created_when': su['competition__created_when'], + 'datasets': su['datasets_total'], + } + + return Response(competitions_usage, status=status.HTTP_200_OK) \ No newline at end of file diff --git a/src/apps/competitions/migrations/0035_file_size.py b/src/apps/competitions/migrations/0035_auto_20230914_1319.py similarity index 95% rename from src/apps/competitions/migrations/0035_file_size.py rename to src/apps/competitions/migrations/0035_auto_20230914_1319.py index 33d7d668a..60e6cd96a 100644 --- a/src/apps/competitions/migrations/0035_file_size.py +++ b/src/apps/competitions/migrations/0035_auto_20230914_1319.py @@ -1,4 +1,4 @@ -# Generated by Django 2.2.17 on 2023-08-10 15:17 +# Generated by Django 2.2.17 on 2023-09-14 13:19 from django.db import migrations, models diff --git a/src/settings/base.py b/src/settings/base.py index 882fca340..fc5760895 100644 --- a/src/settings/base.py +++ b/src/settings/base.py @@ -224,9 +224,13 @@ 'task': 'competitions.tasks.submission_status_cleanup', 'schedule': timedelta(seconds=3600) }, + # 'create_storage_analytics_snapshot': { + # 'task': 'analytics.tasks.create_storage_analytics_snapshot', + # 'schedule': crontab(hour='2', minute='0', day_of_week='sun') # Every Sunday at 02:00 UTC time + # }, 'create_storage_analytics_snapshot': { 'task': 'analytics.tasks.create_storage_analytics_snapshot', - 'schedule': crontab(hour='2', minute='0', day_of_week='sun') # Every Sunday at 02:00 UTC time + 'schedule': crontab(hour='16', minute='24') }, 'reset_computed_storage_analytics': { 'task': 'analytics.tasks.reset_computed_storage_analytics', diff --git a/src/static/js/ours/client.js b/src/static/js/ours/client.js index 764fb109a..2d9484889 100644 --- a/src/static/js/ours/client.js +++ b/src/static/js/ours/client.js @@ -308,6 +308,12 @@ CODALAB.api = { get_analytics: (filters) => { return CODALAB.api.request('GET', `${URLS.API}analytics/`, filters) }, + get_storage_usage_history: (filters) => { + return CODALAB.api.request('GET', `${URLS.API}analytics/storage_usage_history/`, filters); + }, + get_competitions_usage: (filters) => { + return CODALAB.api.request('GET', `${URLS.API}analytics/competitions_usage/`, filters); + }, /*--------------------------------------------------------------------- User Quota and Cleanup ---------------------------------------------------------------------*/ diff --git a/src/static/js/ours/utils.js b/src/static/js/ours/utils.js index 7407c5aa2..8cfd4b4fb 100644 --- a/src/static/js/ours/utils.js +++ b/src/static/js/ours/utils.js @@ -89,6 +89,17 @@ function pretty_date(date_string) { } } +function pretty_bytes(bytes, decimal_places=1, suffix="B") { + const units = ['', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi']; + for (const unit of units) { + if (Math.abs(bytes) < 1024.0 || unit == 'PiB') { + return bytes.toFixed(decimal_places) + unit + suffix; + } + bytes /= 1024.0; + } + return bytes.toFixed(decimal_places) + "Pi" + suffix; +} + /* ---------------------------------------------------------------------------- Form data helpers ----------------------------------------------------------------------------*/ diff --git a/src/static/riot/analytics/analytics.tag b/src/static/riot/analytics/analytics.tag index a1b89f8a3..07ddf4c0e 100644 --- a/src/static/riot/analytics/analytics.tag +++ b/src/static/riot/analytics/analytics.tag @@ -1,14 +1,19 @@

Analytics

+ +

Date Range

-
-