Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
65c8af8
front end changes to show datasets menu item, other items moved to ac…
ihsaan-ullah Jun 28, 2025
013b7a3
submission changes reverted
ihsaan-ullah Jun 28, 2025
dfb0b5f
Merge branch 'develop' into datasets
ihsaan-ullah Jul 24, 2025
5ca91ec
datasets model updates, added frontend for datasets, added backend fo…
ihsaan-ullah Jul 24, 2025
fd3556d
create buttons added to datasets and benchmarks pages, new datasets c…
ihsaan-ullah Jul 25, 2025
1c34ffc
minor style changes, added create and upload buttons to public benchm…
ihsaan-ullah Jul 25, 2025
d041045
dataset detail page added, serializer updated, backend updated for da…
ihsaan-ullah Jul 29, 2025
341a968
secure download option added in dataset details
ihsaan-ullah Jul 30, 2025
64d62ca
pagination fixed
ihsaan-ullah Jul 30, 2025
e2cd58a
datasets create page added, minor updates to pretty_bytes function, r…
ihsaan-ullah Jul 30, 2025
69807ac
tests added for public datasets, dataset detail, and dataset download
ihsaan-ullah Jul 30, 2025
15d739d
selenium test fix
ihsaan-ullah Jul 30, 2025
84a8712
minor text updates
ihsaan-ullah Jul 31, 2025
cef5d55
Merge pull request #2012 from codalab/develop
Didayolo Sep 25, 2025
ed9a9dd
* added for required fields, make public checkbox added
ihsaan-ullah Sep 29, 2025
c22da57
dataset create tests added. Validation added for file_size field in c…
ihsaan-ullah Sep 30, 2025
8517601
create dataset test updated to check for is_public status of dataset
ihsaan-ullah Sep 30, 2025
adc47ed
flake8 fixes
ihsaan-ullah Sep 30, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions src/apps/api/serializers/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,28 @@ def create(self, validated_data):
return instance


class DatasetSerializer(serializers.ModelSerializer):
    """Serialize the listing/detail metadata of a ``Data`` record.

    ``created_by`` is not emitted as the related user object; it is
    computed by ``get_created_by`` and rendered as the creator's username.
    """

    created_by = serializers.SerializerMethodField()

    class Meta:
        model = Data
        fields = (
            'id',
            'type',
            'name',
            'description',
            'file_size',
            'license',
            'downloads',
            'is_verified',
            'created_when',
            'created_by',
        )

    def get_created_by(self, obj):
        """Return the username of the user stored in ``obj.created_by``."""
        creator = obj.created_by
        return creator.username


class DataSimpleSerializer(serializers.ModelSerializer):

class Meta:
Expand Down
220 changes: 220 additions & 0 deletions src/apps/api/tests/test_datasets.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
from django.urls import reverse
from faker import Factory
from django.test import TestCase
from rest_framework.test import APITestCase
from datasets.models import Data
from factories import UserFactory, DataFactory
from utils.data import pretty_bytes, gb_to_bytes
from unittest.mock import patch


faker = Factory.create()
Expand Down Expand Up @@ -86,3 +88,221 @@ def test_dataset_api_check_quota(self):
'file_size': file_size,
})
assert resp.status_code == 201


class DatasetDetailTests(TestCase):
    """Status codes returned by the ``datasets:detail`` view for various viewers."""

    def setUp(self):
        """Create an owner (logged in) and a second user, plus three datasets."""
        self.owner = UserFactory(username="owner")
        self.other_user = UserFactory(username="other")
        self.client.force_login(self.owner)

        # Owner's public dataset
        self.public_dataset = DataFactory(
            name="Public Dataset",
            is_public=True,
            created_by=self.owner,
            type=Data.PUBLIC_DATA,
        )

        # Owner's private dataset
        self.private_dataset = DataFactory(
            name="Private Dataset",
            is_public=False,
            created_by=self.owner,
            type=Data.INPUT_DATA,
        )

        # Private dataset belonging to the second user
        self.other_private_dataset = DataFactory(
            name="Other User's Private Dataset",
            is_public=False,
            created_by=self.other_user,
            type=Data.REFERENCE_DATA,
        )

    def _fetch_detail(self, pk):
        """GET the detail page of dataset ``pk`` using the current client session."""
        detail_url = reverse("datasets:detail", args=[pk])
        return self.client.get(detail_url)

    def test_view_public_dataset(self):
        """A logged-out visitor gets 200 for a public dataset."""
        self.client.logout()
        page = self._fetch_detail(self.public_dataset.pk)
        self.assertEqual(page.status_code, 200)

    def test_view_private_dataset_as_owner(self):
        """The owner gets 200 for their own private dataset."""
        page = self._fetch_detail(self.private_dataset.pk)
        self.assertEqual(page.status_code, 200)

    def test_view_private_dataset_as_other_user(self):
        """A different logged-in user gets 404 for someone else's private dataset."""
        self.client.force_login(self.other_user)
        page = self._fetch_detail(self.private_dataset.pk)
        self.assertEqual(page.status_code, 404)

    def test_view_nonexistent_dataset(self):
        """A primary key that matches no dataset yields 404."""
        page = self._fetch_detail(99999)
        self.assertEqual(page.status_code, 404)


class DatasetDownloadTests(TestCase):
    """Redirect, download-counter and access behaviour of ``datasets:download_by_pk``."""

    def setUp(self):
        """Log the owner in and create one public and one private dataset with known counters."""
        self.owner = UserFactory(username="owner")
        self.other_user = UserFactory(username="other")
        self.client.force_login(self.owner)

        self.public_dataset = DataFactory(
            is_public=True,
            created_by=self.owner,
            downloads=5,
        )

        self.private_dataset = DataFactory(
            is_public=False,
            created_by=self.owner,
            downloads=2,
        )

    def _download(self, pk):
        """GET the download endpoint for dataset ``pk`` using the current client session."""
        download_url = reverse("datasets:download_by_pk", args=[pk])
        return self.client.get(download_url)

    def test_download_public_dataset(self):
        """Downloading a public dataset redirects to the signed URL and bumps the counter."""
        # `make_url_sassy` is replaced for the duration of the request so the
        # test does not depend on real file storage or signature logic.
        signed_url = "http://codebench-storage/public_dataset.zip"
        with patch("datasets.views.make_url_sassy", return_value=signed_url):
            response = self._download(self.public_dataset.pk)

        # Redirect to the (mocked) URL
        self.assertEqual(response.status_code, 302)
        self.assertEqual(response["Location"], signed_url)

        # Counter went from 5 to 6
        self.public_dataset.refresh_from_db()
        self.assertEqual(self.public_dataset.downloads, 6)

    def test_download_private_dataset_as_owner(self):
        """The owner can download their private dataset; the counter is incremented."""
        # Same mocking rationale as for the public dataset test.
        signed_url = "http://codebench-storage/private_dataset.zip"
        with patch("datasets.views.make_url_sassy", return_value=signed_url):
            response = self._download(self.private_dataset.pk)

        self.assertEqual(response.status_code, 302)
        self.assertEqual(response["Location"], signed_url)

        # Counter went from 2 to 3
        self.private_dataset.refresh_from_db()
        self.assertEqual(self.private_dataset.downloads, 3)

    def test_download_private_dataset_as_other_user(self):
        """A logged-in user who is not the owner receives 404 for a private dataset."""
        self.client.force_login(self.other_user)

        response = self._download(self.private_dataset.pk)

        self.assertEqual(response.status_code, 404)

    def test_download_nonexistent_dataset(self):
        """A primary key that matches no dataset yields 404 (not found)."""
        response = self._download(99999)

        self.assertEqual(response.status_code, 404)


class DatasetCreateTests(APITestCase):
    """Creation of datasets through the ``data-list`` API endpoint."""

    def setUp(self):
        """Create the test user and log in, failing fast if the login is rejected."""
        self.user = UserFactory(username='creator', password='creator')
        # `login` returns a boolean instead of raising; without this check a
        # rejected login would only surface later as unexpected 403 responses.
        logged_in = self.client.login(username='creator', password='creator')
        self.assertTrue(logged_in, "test user 'creator' could not log in")

    def _post_dataset(self, payload):
        """POST ``payload`` to the ``data-list`` endpoint and return the response."""
        return self.client.post(reverse("data-list"), payload)

    @patch("api.views.datasets.make_url_sassy")  # Replaces the real `make_url_sassy` function in this test only
    def test_create_dataset_success(self, mock_make_url_sassy):
        """A valid payload creates the dataset; ``is_public`` is False unless sent as True."""
        fake_sassy_url = "https://codabench-storage/dataset.zip"
        mock_make_url_sassy.return_value = fake_sassy_url

        # Case 1: Without is_public (should default to False)
        resp = self._post_dataset({
            'name': 'my-new-dataset',
            'type': Data.PUBLIC_DATA,
            # Faker prepends the dot itself, so the extension is passed without one
            'request_sassy_file_name': faker.file_name(extension='zip'),
            'file_size': 1234,
            'file_name': faker.file_name(),
        })
        self.assertEqual(resp.status_code, 201)
        self.assertIn("key", resp.data)
        self.assertEqual(resp.data["sassy_url"], fake_sassy_url)

        dataset = Data.objects.get(name="my-new-dataset")
        self.assertEqual(dataset.created_by, self.user)
        self.assertFalse(dataset.is_public)
        mock_make_url_sassy.assert_called_once_with(dataset.data_file.name, 'w')

        # Case 2: With is_public=True
        mock_make_url_sassy.reset_mock()
        resp = self._post_dataset({
            'name': 'my-public-dataset',
            'type': Data.PUBLIC_DATA,
            'request_sassy_file_name': faker.file_name(extension='zip'),
            'file_size': 1234,
            'file_name': faker.file_name(),
            'is_public': True,
        })
        self.assertEqual(resp.status_code, 201)
        dataset = Data.objects.get(name="my-public-dataset")
        self.assertTrue(dataset.is_public)
        mock_make_url_sassy.assert_called_once_with(dataset.data_file.name, 'w')

    def test_cannot_create_dataset_with_missing_fields(self):
        """Omitting ``file_size``, ``request_sassy_file_name`` or ``type`` yields a 400."""

        # missing file_size
        resp = self._post_dataset({
            'name': 'incomplete-dataset',
            'file_name': faker.file_name(),
            'type': Data.PUBLIC_DATA,
            'request_sassy_file_name': faker.file_name(extension='zip'),
        })
        self.assertEqual(resp.status_code, 400)
        self.assertIn("file_size", resp.data)
        # NOTE(review): unlike the other fields below, the whole value is compared
        # rather than element [0] - presumably the view reports a missing file_size
        # as a bare string; confirm against the view before changing this.
        self.assertEqual(resp.data["file_size"], "This field is required.")

        # missing request_sassy_file_name
        resp = self._post_dataset({
            'name': 'incomplete-dataset',
            'file_name': faker.file_name(),
            'type': Data.PUBLIC_DATA,
            'file_size': 1234,
        })
        self.assertEqual(resp.status_code, 400)
        self.assertIn("request_sassy_file_name", resp.data)
        self.assertEqual(resp.data["request_sassy_file_name"][0], "This field is required.")

        # missing type
        resp = self._post_dataset({
            'name': 'incomplete-dataset',
            'file_name': faker.file_name(),
            'file_size': 1234,
            'request_sassy_file_name': faker.file_name(extension='zip'),
        })
        self.assertEqual(resp.status_code, 400)
        self.assertIn("type", resp.data)
        self.assertEqual(resp.data["type"][0], "This field is required.")

    def test_cannot_create_dataset_with_invalid_file_size(self):
        """A non-numeric ``file_size`` is rejected with a 400 and a number-validation message."""
        resp = self._post_dataset({
            'name': 'invalid-size-dataset',
            'file_name': faker.file_name(),
            'type': Data.PUBLIC_DATA,
            'request_sassy_file_name': faker.file_name(),
            'file_size': "not-a-number",  # invalid type
        })

        self.assertEqual(resp.status_code, 400)
        self.assertIn("file_size", resp.data)
        self.assertEqual(resp.data["file_size"][0], "A valid number is required.")

    def test_cannot_create_dataset_unauthenticated(self):
        """After logging out, a create request is refused with 403."""
        self.client.logout()
        resp = self._post_dataset({
            'name': 'unauth-dataset',
            'file_name': faker.file_name(),
            'type': Data.PUBLIC_DATA,
            'request_sassy_file_name': faker.file_name(),
            'file_size': 1234,
        })
        self.assertEqual(resp.status_code, 403)
106 changes: 106 additions & 0 deletions src/apps/api/tests/test_public_datasets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
from rest_framework.test import APIClient
from django.test import TestCase
from factories import UserFactory, DataFactory
from datasets.models import Data


class PublicDatasetsTests(TestCase):
    """Ordering, search and filter behaviour of the ``/api/datasets/public/`` endpoint."""

    def setUp(self):
        """Authenticate an API client and create four public datasets with varied metadata."""
        # Set up test client and authenticate as a test user
        self.client = APIClient()
        self.user = UserFactory()
        self.client.force_authenticate(user=self.user)

        # Create public datasets with varying metadata to test filters and sorting
        self.dataset1 = DataFactory(
            name="Climate Data",
            description="Temperature and rainfall records",
            is_public=True,
            type=Data.PUBLIC_DATA,
            license="MIT",
            is_verified=True,
            downloads=10,
        )

        self.dataset2 = DataFactory(
            name="Vision Dataset",
            description="Images for computer vision",
            is_public=True,
            type=Data.PUBLIC_DATA,
            license=None,
            is_verified=False,
            downloads=25,
        )

        self.dataset3 = DataFactory(
            name="Unverified Text",
            description="NLP dataset",
            is_public=True,
            type=Data.PUBLIC_DATA,
            license="Apache 2.0",
            is_verified=False,
            downloads=5,
        )

        self.dataset4 = DataFactory(
            name="Recent Genomics",
            description="DNA sequences",
            is_public=True,
            type=Data.PUBLIC_DATA,
            downloads=40,
            is_verified=True,
        )

    def _get_results(self, query=""):
        """GET the public datasets list with ``query`` appended, assert 200, return the results."""
        response = self.client.get("/api/datasets/public/" + query)
        self.assertEqual(response.status_code, 200)
        return response.data["results"]

    def test_default_ordering_recently_added(self):
        """Without an ordering parameter, results come back by descending id."""
        ids = [d["id"] for d in self._get_results()]
        # An empty or single-element page would satisfy any ordering check,
        # so require at least two rows before comparing.
        self.assertGreater(len(ids), 1)
        self.assertEqual(ids, sorted(ids, reverse=True))  # Default ordering by -id

    def test_ordering_by_most_downloaded(self):
        """``ordering=most_downloaded`` sorts by download count, highest first."""
        downloads = [d["downloads"] for d in self._get_results("?ordering=most_downloaded")]
        # Guard against a vacuous pass on an empty or single-element page.
        self.assertGreater(len(downloads), 1)
        self.assertEqual(downloads, sorted(downloads, reverse=True))

    def test_filter_by_search_term(self):
        """``search=vision`` returns only datasets mentioning it in name or description."""
        results = self._get_results("?search=vision")
        # `all()` over an empty list is True; make sure something actually matched.
        self.assertTrue(results, "search returned no datasets")
        for d in results:
            searchable = d["name"].lower() + " " + d["description"].lower()
            self.assertIn("vision", searchable)

    def test_filter_by_has_license_true(self):
        """``has_license=true`` returns only datasets whose license is set."""
        results = self._get_results("?has_license=true")
        self.assertTrue(results, "license filter returned no datasets")
        for d in results:
            self.assertIsNotNone(d["license"])

    def test_filter_by_is_verified_true(self):
        """``is_verified=true`` returns only verified datasets."""
        results = self._get_results("?is_verified=true")
        self.assertTrue(results, "verified filter returned no datasets")
        for d in results:
            self.assertIs(d["is_verified"], True)

    def test_combined_filter_verified_with_license(self):
        """Both filters together return datasets that are verified and licensed."""
        results = self._get_results("?is_verified=true&has_license=true")
        self.assertTrue(results, "combined filter returned no datasets")
        for d in results:
            self.assertTrue(d["is_verified"])
            self.assertIsNotNone(d["license"])

    def test_combined_search_and_filter(self):
        """Search and ``is_verified`` combined narrow the list to the single expected dataset."""
        results = self._get_results("?search=genomics&is_verified=true")

        # Expect exactly one match with the correct name
        self.assertEqual(len(results), 1)
        self.assertEqual(results[0]["name"], "Recent Genomics")
Loading