plinder-org
diff --git a/‎.github/workflows/main.yaml‎
Lines changed: 11 additions & 1 deletion b/‎.github/workflows/main.yaml‎
Lines changed: 11 additions & 1 deletion
diff --git a/‎.gitignore‎
Lines changed: 1 addition & 0 deletions b/‎.gitignore‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎CITATION.cff‎
Lines changed: 105 additions & 0 deletions b/‎CITATION.cff‎
Lines changed: 105 additions & 0 deletions
diff --git a/‎LICENSE.txt‎
Lines changed: 1 addition & 1 deletion b/‎LICENSE.txt‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎NOTICE‎
Lines changed: 10 additions & 0 deletions b/‎NOTICE‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎README.md‎
Lines changed: 44 additions & 27 deletions b/‎README.md‎
Lines changed: 44 additions & 27 deletions
diff --git a/‎assets/plinder.png‎
-86.3 KB b/‎assets/plinder.png‎
-86.3 KB
diff --git a/‎assets/plinder_test_stratification.png‎
962 KB b/‎assets/plinder_test_stratification.png‎
962 KB
diff --git a/‎assets/workflow.png‎
-602 KB b/‎assets/workflow.png‎
-602 KB
diff --git a/‎docs/Makefile‎
Lines changed: 20 additions & 0 deletions b/‎docs/Makefile‎
Lines changed: 20 additions & 0 deletions
@@ -38,5 +38,15 @@ jobs:
         if: steps.get-tag.outputs.bump != ''
         run: python flows/docker.py test --push
       - name: Save git tag
-        if: steps.get-tag.outputs.bump != 'skip'
+        if: steps.get-tag.outputs.bump != ''
         run: git push origin ${{ steps.get-tag.outputs.bump }}
+      - name: Copy and surgery coverage
+        if: steps.get-tag.outputs.bump != ''
+        run: |
+          cp reports/.coverage .
+          sqlite3 .coverage "update file set path='src/' || substr(path, 40);"
+      - name: Post coverage comment
+        if: steps.get-tag.outputs.bump != ''
+        uses: py-cov-action/python-coverage-comment-action@v3
+        with:
+          GITHUB_TOKEN: ${{ github.token }}
@@ -88,6 +88,7 @@ instance/
 
 # Sphinx documentation
 docs/_build/
+docs/source/pinder*
 
 # PyBuilder
 .pybuilder/
 
@@ -0,0 +1,105 @@
+cff-version: 1.2.0
+message: "If you use this software, please cite it as below."
+authors:
+- family-names: "Durairaj"
+  given-names: "Janani"
+- family-names: "Adeshina"
+  given-names: "Yusuf"
+- family-names: "Cao"
+  given-names: "Zhonglin"
+- family-names: "Zhang"
+  given-names: "Xuejin"
+- family-names: "Oleinikovas"
+  given-names: "Vladas"
+- family-names: "Duignan"
+  given-names: "Thomas"
+- family-names: "McClure"
+  given-names: "Zachary"
+- family-names: "Robin"
+  given-names: "Xavier"
+- family-names: "Rossi"
+  given-names: "Emanuele"
+- family-names: "Zhou"
+  given-names: "Guoqing"
+- family-names: "Veccham"
+  given-names: "Srimukh"
+- family-names: "Isert"
+  given-names: "Clemens"
+- family-names: "Peng"
+  given-names: "Yuxing"
+- family-names: "Sundareson"
+  given-names: "Prabindh"
+- family-names: "Akdel"
+  given-names: "Mehmet"
+- family-names: "Corso"
+  given-names: "Gabriele"
+- family-names: "Stärk"
+  given-names: "Hannes"
+- family-names: "Carpenter"
+  given-names: "Zachary"
+- family-names: "Bronstein"
+  given-names: "Michael"
+- family-names: "Kucukbenli"
+  given-names: "Emine"
+- family-names: "Schwede"
+  given-names: "Torsten"
+- family-names: "Naef"
+  given-names: "Luca"
+title: "PLINDER: The Protein-Ligand Interactions Dataset and Evaluation Resource"
+doi: 10.1101/2024.07.17.603955
+version: 0.0.1
+date-released: 2024-07-17
+url: "https://github.com/plinder-org/plinder"
+preferred-citation:
+  type: conference-paper
+  authors:
+  - family-names: "Durairaj"
+    given-names: "Janani"
+  - family-names: "Adeshina"
+    given-names: "Yusuf"
+  - family-names: "Cao"
+    given-names: "Zhonglin"
+  - family-names: "Zhang"
+    given-names: "Xuejin"
+  - family-names: "Oleinikovas"
+    given-names: "Vladas"
+  - family-names: "Duignan"
+    given-names: "Thomas"
+  - family-names: "McClure"
+    given-names: "Zachary"
+  - family-names: "Robin"
+    given-names: "Xavier"
+  - family-names: "Rossi"
+    given-names: "Emanuele"
+  - family-names: "Zhou"
+    given-names: "Guoqing"
+  - family-names: "Veccham"
+    given-names: "Srimukh"
+  - family-names: "Isert"
+    given-names: "Clemens"
+  - family-names: "Peng"
+    given-names: "Yuxing"
+  - family-names: "Sundareson"
+    given-names: "Prabindh"
+  - family-names: "Akdel"
+    given-names: "Mehmet"
+  - family-names: "Corso"
+    given-names: "Gabriele"
+  - family-names: "Stärk"
+    given-names: "Hannes"
+  - family-names: "Carpenter"
+    given-names: "Zachary"
+  - family-names: "Bronstein"
+    given-names: "Michael"
+  - family-names: "Kucukbenli"
+    given-names: "Emine"
+  - family-names: "Schwede"
+    given-names: "Torsten"
+  - family-names: "Naef"
+    given-names: "Luca"
+  doi: "10.1101/2024.07.17.603955"
+  journal: "bioRxiv"
+  eventtitle: "Machine Learning for Life and Material Science, ICML 2024"
+  month: 7
+  title: "PLINDER: The Protein-Ligand Interactions Dataset and Evaluation Resource"
+  year: 2024
@@ -186,7 +186,7 @@
       same "printed page" as the copyright notice for easier
       identification within third-party archives.
 
-   Copyright 2024 VantAI, Inc.
+   Copyright 2024 Plinder Development Team
 
    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
 
@@ -0,0 +1,10 @@
+PLINDER - The Protein Ligand INteractions Dataset and Evaluation Resource
+Copyright (c) 2024, Plinder Development Team
+
+The PLINDER project is a collaboration between the
+University of Basel, SIB Swiss Institute of Bioinformatics,
+VantAI, NVIDIA, and MIT CSAIL.
+
+If you find this software useful, please cite:
+
+Durairaj, Janani, Yusuf Adeshina, Zhonglin Cao, Xuejin Zhang, Vladas Oleinikovas, Thomas Duignan, Zachary McClure, et al. “PLINDER: The Protein-Ligand Interactions Dataset and Evaluation Resource.” bioRxiv, July 17, 2024, 2024.07.17.603955. https://doi.org/10.1101/2024.07.17.603955.
@@ -1,4 +1,4 @@
-![plinder](./assets/plinder.png)
+![plinder](https://github.com/user-attachments/assets/05088c51-36c8-48c6-a7b2-8a69bd40fb44)
 
 <div align="center">
     <h1>The Protein Ligand INteractions Dataset and Evaluation Resource</h1>
@@ -8,8 +8,23 @@
 
 [![license](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://github.com/plinder-org/plinder/blob/master/LICENSE.txt)
 [![test](https://github.com/plinder-org/plinder/actions/workflows/pr.yaml/badge.svg)](https://github.com/plinder-org/plinder/actions/workflows/pr.yaml)
-[![Coverage badge](https://github.com/plinder-org/plinder/raw/python-coverage-comment-action-data/badge.svg)](https://github.com/plinder-org/plinder/tree/python-coverage-comment-action-data)
+[![coverage](https://github.com/plinder-org/plinder/raw/python-coverage-comment-action-data/badge.svg)](https://github.com/plinder-org/plinder/tree/python-coverage-comment-action-data)
 
+![overview](https://github.com/user-attachments/assets/39d251b1-8114-4242-b9fc-e0cce900d22f)
+
+# 📚 About
+
+**plinder**, short for **p**rotein **l**igand **in**teractions **d**ataset and **e**valuation **r**esource,
+is a dataset and resource for training and evaluation of protein-ligand docking algorithms.
+It is a comprehensive, annotated, high quality dataset:
+
+- \> 400k PLI systems across > 11k SCOP domains and > 50k unique small molecules
+- 500+ annotations for each system, including protein and ligand properties, quality, matched molecular series and more
+- Automated curation pipeline to keep up with the PDB
+- 14 PLI metrics and over 20 billion similarity scores
+- Unbound \(_apo_\) and _predicted_ AlphaFold2 structures linked to _holo_ systems
+- `train-val-test` splits and ability to tune splitting based on the learning task
+- Robust evaluation harness to simplify and standardize performance comparison between models
 
 # 📢 Notice
 
@@ -19,32 +34,16 @@ VantAI, NVIDIA, MIT CSAIL, and the community at large.
 If you find `plinder` useful,
 please see the citation file for details on how to cite.
 
-# 🚧 Under construction
-
-Please bear with us as we migrate the `plinder` project to
-open source as we work to share it with the world. There are
-some gaps in the code and documentation, which will be fixed
-as soon as possible. The dataset itself is complete, but the
-code to interact with some parts of the dataset is still under
-development.
-
-# 📚 About
-
-**plinder**, short for **p**rotein **l**igand **in**teractions **d**ataset and **e**valuation **r**esource,
-is a dataset and resource for training and evaluation of protein-ligand docking algorithms.
-
 # 👨💻 Getting Started
 
 Please use a virtual environment for the `plinder` project.
 We recommend the [miniforge](https://github.com/conda-forge/miniforge) environment manager.
 
-
 **NOTE**: We currently only support a Linux environment. `plinder`
 uses `openstructure` for some of its functionality and is available
 from the `aivant` conda channel using `conda install aivant::openstructure`, but it is only built targeting Linux architectures.
 For MacOS users, please see the relevant [docker](#package-publishing) resources below.
 
-
 ## Install plinder
 
 The `plinder` package can be obtained from GitHub:
@@ -60,8 +59,7 @@ Or with a development installation:
     cd plinder
     pip install -e '.[dev]'
 
-
-# ⬇️  Getting the dataset
+# ⬇️ Getting the dataset
 
 Using the `plinder.core` API, you can transparently and lazily
 download and interact with most of the components of the dataset.
@@ -109,21 +107,37 @@ with the dataset.
 
 ## 🏅 Gold standard benchmark sets
 
-Discuss stratification efforts
+As part of the `plinder` resource, we also provide train, validation and test splits that are curated to minimize information leakage based on protein-ligand interaction similarity. In addition, we have prioritized systems that have a linked experimental `apo` structure or matched molecular series to support realistic inference scenarios for hit discovery and optimization.
+Finally, particular care is taken with the test set, which is further prioritized to contain high quality structures to provide unambiguous ground truths for performance benchmarking.
+
+![plinder](./assets/plinder_test_stratification.png)
+
+Moreover, as we anticipate this resource being used for benchmarking a wide range of methods, including those simultaneously predicting protein structure (a.k.a. co-folding) or those generating novel ligand structures, we further stratified the test set (by novel ligand, pocket, protein or all) to cover a wide range of tasks.
+
+Our latest test split [#TODO] contains:
+
+| Novel   |   # of systems | # of high quality |  stratification criteria |
+|:--------|---------------:|------------------:|:---------------:|
+| pocket  | 5206 | 5203 | PLI shared < 50 _&_  Pocket shared lDDT < 0.5 |
+| ligand  | 2395 | 2395 | ECFP4 fingerprint similarity < 0.3 |
+| protein |  983 |  983 | Protein Seq. Sim. < 0.3 _&_ Protein lDDT > 0.7 |
+| all     |  268 |  268 | all of the above |
+| none    |    0 |    0 | none of the above |
+
 
 ## 🧪 Training set
 
 Discuss the splits
 
-## ⚖️  Evaluation harness
+## ⚖️ Evaluation harness
 
 See the [`plinder.eval`](#src/plinder-eval/plinder/eval/docking/README.md) docs for more details.
 
 ## 📦 Dataloader
 
 Dataloader is currently under construction.
 
-## ℹ️  Filters & Annotations
+## ℹ️ Filters & Annotations
 
 See the [`plinder.data`](#src/plinder-data/plinder/data/README.md) docs for more details.
 
@@ -135,7 +149,6 @@ We are currently working on the following:
 - Establishing a leaderboard
 - Improving the documentation and examples
 
-
 # 👨💻 Code organization
 
 This code is split into 4 sub-packages
@@ -147,16 +160,15 @@ This code is split into 4 sub-packages
 
 # 💽 Dataset Generation
 
-![Workflow](./assets/workflow.png)
+![workflow](https://github.com/user-attachments/assets/cde72643-5fdf-4998-8719-216d0cef2706)
 
 See the [End-to-end pipeline](#src/plinder-data/README.md) description for technical details about the dataset generation.
 
-
 # 📝 Examples & documentation
 
 Package documentation, including API documentation, [example notebooks](examples/), and supplementary guides, are made available.
 
-# ⚙️  Dev guide
+# ⚙️ Dev guide
 
 To develop and test changes to the source code, please use a development installation:
 
@@ -221,3 +233,8 @@ since the previous release:
 - If `bumpversion patch` is present in the commit message (or nothing is found), the patch version will be bumped
 
 **NOTE**: The CI workflow will use the __most recent__ match in the commit history to make its decision.
+
+# 📃 Publications
+Durairaj, Janani, Yusuf Adeshina, Zhonglin Cao, Xuejin Zhang, Vladas Oleinikovas, Thomas Duignan, Zachary McClure, Xavier Robin, Emanuele Rossi, Guoqing Zhou, Srimukh Prasad Veccham, Clemens Isert, Yuxing Peng, Prabindh Sundareson, Mehmet Akdel, Gabriele Corso, Hannes Stärk, Zachary Wayne Carpenter, Michael M. Bronstein, Emine Kucukbenli, Torsten Schwede, Luca Naef. 2024. “PLINDER: The Protein-Ligand Interactions Dataset and Evaluation Resource.”
+[bioRxiv](https://doi.org/10.1101/2024.07.17.603955)
+[ICML'24 ML4LMS](https://openreview.net/forum?id=7UvbaTrNbP)
@@ -0,0 +1,20 @@
+# Minimal makefile for Sphinx documentation.
+# Both rules below only forward the requested target to "$(SPHINXBUILD) -M <target>".
+
+# SPHINXOPTS and SPHINXBUILD are assigned with ?=, so a value from the command line
+# or the environment takes precedence; SOURCEDIR and BUILDDIR yield only to the command line.
+SPHINXOPTS    ?=
+SPHINXBUILD   ?= sphinx-build
+SOURCEDIR     = source
+BUILDDIR      = build
+
+# Default goal: listed first so that a bare "make" behaves like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+# NOTE(review): Makefile is presumably declared phony so the catch-all rule below never tries to remake it -- confirm.
+.PHONY: help Makefile
+
+# Catch-all pattern rule: any other target name ($@) is handed to Sphinx through its -M
+# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS); it is not assigned in this file.
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)