Skip to content

Commit 7f4a57a

Browse files
authored
Plinder Data Loader (#15)
* feat: plinder_download command with a progress bar * chore: wip * chore: clean up rough edges around zips for entries * fix: more root pathing issues * chore: work down tech debt in pipeline config up to make_entries * fix: download binding db * fix: refs to ingest config * chore: work down more tech debt in ingest pipeline * test: end to end test runs up to make_mmp_index * test: index utils * test: query index * feat: plinder loader gets linked systems * feat: naive cross similarity * feat: ligand cross similarity * chore: migrate loader as is from internal * wip: loader POC * docs: plinder[loader] block * feat: atom3d loader for system cif * style: linters * style: mypy * test: update system and scores tests * fix: when logs throw exceptions * style: ruff * fix: entry now forces path * fix: parsing stored entry jsons * feat: create index as a utility * feat: wip * fix: defer system edge cases to later * chore: lazy load entry and separate from system * fix: gracefully create index when no clusters exist * fix: instrument scoring modalities * fix: rerun scoring * chore: bring back collate partitions * fix: check checkpointing in scatter * fix: typo * chore: wip * fix: mmp * feat: write out linked structures * fix: pls apo * fix: mp get context, del links before pool, simple map * chore: bump flow * fix: skip versioning because of docs branch * fix: apo/pred runs * chore: flow bump * style: ruff * fix: memory usage in apo/pred * chore: support custom script execution in docker run * feat: loader supports linked structures * test: separate links from linked_structures * chore: simplify cloudpathlib access for plinder offline * style: mypy * style: ruff * test: minimal data loader test * fix: registry env * feat: split utils * fix: no extra filename * fix: aftermath of simplifying get_plinder_path * fix: shuttle tox env through docker * chore: tox env * style: ruff * test: resolve query test failures and add some more * style: ruff * feat: dataset loads split not index * feat: filter dataset by split * test: stability in the absence of gcs * fix: revert to 2024-04/v1
1 parent 7a320a4 commit 7f4a57a

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

58 files changed

+2017
-1125
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -189,4 +189,5 @@ src/plinder-data/plinder/data/artifacts
189189
*.bak*
190190
*.1.*
191191
tests/xx
192+
tests/test_data/plinder/mount/systems/
192193
artifacts

README.md

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -185,7 +185,12 @@ See the [`plinder.eval`](docs/eval/README.md) docs for more details.
185185

186186
## 📦 Dataloader
187187

188-
Dataloader is currently under construction.
188+
The `plinder.data.loader` package contains a `PyTorch` dataloader for the dataset using the `atom3d` format. It is an example of using the `plinder.core` API
189+
to implement a dataloader, but is not the only way to use the dataset.
190+
191+
**Note**: The dataloader requires both `torch` and `atom3d` to be installed. Use the `[loader]` dependency block when installing `plinder`:
192+
193+
pip install .[loader]
189194

190195
## ℹ️ Filters & Annotations
191196

flows/data_ingest.py

Lines changed: 91 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
MOUNT = "/plinder"
1313
K8S = dict(
1414
cpu=1,
15-
image="ghcr.io/plinder-org/plinder:v0.1.1",
15+
image="us-east1-docker.pkg.dev/vantai-analysis/metaflow/plinder:v0.1.3-50-g95faf3ec-dirty",
1616
node_selector={
1717
"topology.kubernetes.io/zone": "us-east1-b",
1818
},
@@ -22,7 +22,7 @@
2222
)
2323
ENV = dict(
2424
vars=dict(
25-
PLINDER_MOUNT="",
25+
PLINDER_MOUNT=MOUNT,
2626
PLINDER_RELEASE="2024-06",
2727
PLINDER_ITERATION="",
2828
)
@@ -31,8 +31,8 @@
3131
WORKSTATION = dict(cpu=14, memory=14000)
3232
WORKSTATION_MEM = dict(cpu=5, memory=48000)
3333
LARGE_MEM = dict(
34-
cpu=1.75,
35-
memory=95000,
34+
cpu=7,
35+
memory=380000,
3636
tolerations=[
3737
dict(
3838
effect="NoSchedule",
@@ -61,9 +61,38 @@ def start(self):
6161
print(f"started data ingest run with config: {self.config_file}")
6262
contents = gcs.download_as_str(gcs_path=self.config_file, bucket_name="plinder-collab-bucket")
6363
self.pipeline = IngestPipeline(conf=get_config(config_contents=contents))
64-
self.next(self.scatter_make_components_and_communities)
65-
# self.next(self.scatter_download_rcsb_files)
64+
# self.next(self.scatter_download_rcsb_files)
65+
# self.next(self.scatter_make_entries)
66+
# self.next(self.scatter_structure_qc)
67+
# self.next(self.scatter_collate_partitions)
68+
# self.next(self.scatter_make_components_and_communities)
69+
self.next(self.assign_apo_pred_systems)
70+
71+
# @kubernetes(**{**K8S, **LARGE_MEM, **{"cpu": 3.5, "memory": 175000}})
72+
# @environment(**ENV)
73+
# @retry
74+
# @step
75+
# def scatter_collate_partitions(self):
76+
# self.chunks = self.pipeline.scatter_collate_partitions()
77+
# self.next(self.collate_partitions, foreach="chunks")
78+
#
79+
# @kubernetes(**K8S)
80+
# @environment(**ENV)
81+
# @retry
82+
# @step
83+
# def collate_partitions(self):
84+
# self.pipeline.collate_partitions(self.input)
85+
# self.next(self.join_collate_partitions)
6686
#
87+
# @kubernetes(**K8S)
88+
# @environment(**ENV)
89+
# @retry
90+
# @step
91+
# def join_collate_partitions(self, inputs):
92+
# self.pipeline = inputs[0].pipeline
93+
# self.merge_artifacts(inputs, exclude=["chunks"])
94+
# self.next(self.make_mmp_index)
95+
6796
# @kubernetes(**K8S)
6897
# @environment(**ENV)
6998
# @retry
@@ -85,6 +114,7 @@ def start(self):
85114
# @retry
86115
# @step
87116
# def join_download_rcsb_files(self, inputs):
117+
# self.pipeline = inputs[0].pipeline
88118
# self.merge_artifacts(inputs, exclude=["chunks"])
89119
# self.next(self.download_alternative_datasets)
90120
#
@@ -103,7 +133,7 @@ def start(self):
103133
# def make_dbs(self):
104134
# self.pipeline.make_dbs()
105135
# self.next(self.scatter_make_entries)
106-
#
136+
107137
# @kubernetes(**K8S)
108138
# @environment(**ENV)
109139
# @retry
@@ -117,7 +147,7 @@ def start(self):
117147
# @retry
118148
# @step
119149
# def make_entries(self):
120-
# self.pipeline.cfg.scatter.make_entries_cpu = WORKSTATION["cpu"]
150+
# self.pipeline.cfg.flow.make_entries_cpu = WORKSTATION["cpu"]
121151
# self.reruns = self.pipeline.make_entries(self.input)
122152
# self.next(self.join_make_entries)
123153
#
@@ -126,6 +156,7 @@ def start(self):
126156
# @retry
127157
# @step
128158
# def join_make_entries(self, inputs):
159+
# self.pipeline = inputs[0].pipeline
129160
# self.merge_artifacts(inputs, exclude=["chunks", "reruns"])
130161
# self.rerun = self.pipeline.join_make_entries([inp.reruns for inp in inputs])
131162
# self.next(self.scatter_make_entries_second_try)
@@ -135,8 +166,8 @@ def start(self):
135166
# @retry
136167
# @step
137168
# def scatter_make_entries_second_try(self):
138-
# self.original_pdb_ids = self.pipeline.cfg.scatter.pdb_ids
139-
# self.pipeline.cfg.scatter.pdb_ids = self.rerun
169+
# self.original_pdb_ids = self.pipeline.cfg.context.pdb_ids
170+
# self.pipeline.cfg.context.pdb_ids = self.rerun
140171
# self.chunks = self.pipeline.scatter_make_entries()
141172
# self.next(self.make_entries_second_try, foreach="chunks")
142173
#
@@ -145,7 +176,7 @@ def start(self):
145176
# @retry
146177
# @step
147178
# def make_entries_second_try(self):
148-
# self.pipeline.cfg.scatter.make_entries_cpu = WORKSTATION_MEM["cpu"]
179+
# self.pipeline.cfg.flow.make_entries_cpu = WORKSTATION_MEM["cpu"]
149180
# self.pipeline.make_entries(self.input)
150181
# self.next(self.join_make_entries_second_try)
151182
#
@@ -154,8 +185,9 @@ def start(self):
154185
# @retry
155186
# @step
156187
# def join_make_entries_second_try(self, inputs):
188+
# self.pipeline = inputs[0].pipeline
157189
# self.merge_artifacts(inputs, exclude=["chunks", "reruns"])
158-
# self.pipeline.cfg.scatter.pdb_ids = self.original_pdb_ids
190+
# self.pipeline.cfg.context.pdb_ids = self.original_pdb_ids
159191
# self.next(self.scatter_structure_qc)
160192
#
161193
# @kubernetes(**K8S)
@@ -179,7 +211,9 @@ def start(self):
179211
# @retry
180212
# @step
181213
# def join_structure_qc(self, inputs):
214+
# self.pipeline = inputs[0].pipeline
182215
# self.merge_artifacts(inputs, exclude=["chunks"])
216+
# self.pipeline.join_structure_qc()
183217
# self.next(self.scatter_make_system_archives)
184218
#
185219
# @kubernetes(**K8S)
@@ -203,6 +237,7 @@ def start(self):
203237
# @retry
204238
# @step
205239
# def join_make_system_archives(self, inputs):
240+
# self.pipeline = inputs[0].pipeline
206241
# self.merge_artifacts(inputs, exclude=["chunks"])
207242
# self.next(self.scatter_make_ligands)
208243
#
@@ -227,6 +262,7 @@ def start(self):
227262
# @retry
228263
# @step
229264
# def join_make_ligands(self, inputs):
265+
# self.pipeline = inputs[0].pipeline
230266
# self.merge_artifacts(inputs, exclude=["chunks"])
231267
# self.next(self.compute_ligand_fingerprints)
232268
#
@@ -259,6 +295,7 @@ def start(self):
259295
# @retry
260296
# @step
261297
# def join_make_ligand_scores(self, inputs):
298+
# self.pipeline = inputs[0].pipeline
262299
# self.merge_artifacts(inputs, exclude=["chunks"])
263300
# self.next(self.make_sub_dbs)
264301
#
@@ -291,6 +328,7 @@ def start(self):
291328
# @retry
292329
# @step
293330
# def join_run_batch_searches(self, inputs):
331+
# self.pipeline = inputs[0].pipeline
294332
# self.merge_artifacts(inputs, exclude=["chunks"])
295333
# self.next(self.scatter_make_batch_scores)
296334
#
@@ -315,40 +353,51 @@ def start(self):
315353
# @retry
316354
# @step
317355
# def join_make_batch_scores(self, inputs):
356+
# self.pipeline = inputs[0].pipeline
318357
# self.merge_artifacts(inputs, exclude=["chunks"])
319-
# self.next(self.scatter_make_components_and_communities)
320-
321-
@kubernetes(**K8S)
322-
@environment(**ENV)
323-
@retry
324-
@step
325-
def scatter_make_components_and_communities(self):
326-
self.chunks = self.pipeline.scatter_make_components_and_communities()
327-
self.next(self.make_components_and_communities, foreach="chunks")
328-
329-
@kubernetes(**{**K8S, **LARGE_MEM})
330-
@environment(**ENV)
331-
@retry
332-
@step
333-
def make_components_and_communities(self):
334-
self.pipeline.make_components_and_communities(self.input)
335-
self.next(self.join_make_components_and_communities)
336-
337-
@kubernetes(**K8S)
338-
@environment(**ENV)
339-
@retry
340-
@step
341-
def join_make_components_and_communities(self, inputs):
342-
self.pipeline = inputs[0].pipeline
343-
self.merge_artifacts(inputs, exclude=["chunks"])
344-
self.next(self.make_mmp_index)
345-
346-
@kubernetes(**{**K8S, **WORKSTATION})
358+
# # self.next(self.scatter_make_components_and_communities)
359+
# self.next(self.make_mmp_index) # scatter_make_components_and_communities)
360+
#
361+
# @kubernetes(**K8S)
362+
# @environment(**ENV)
363+
# @retry
364+
# @step
365+
# def scatter_make_components_and_communities(self):
366+
# self.chunks = self.pipeline.scatter_make_components_and_communities()
367+
# self.next(self.make_components_and_communities, foreach="chunks")
368+
#
369+
# @kubernetes(**{**K8S, **LARGE_MEM})
370+
# @environment(**ENV)
371+
# @retry
372+
# @step
373+
# def make_components_and_communities(self):
374+
# self.pipeline.make_components_and_communities(self.input)
375+
# self.next(self.join_make_components_and_communities)
376+
#
377+
# @kubernetes(**K8S)
378+
# @environment(**ENV)
379+
# @retry
380+
# @step
381+
# def join_make_components_and_communities(self, inputs):
382+
# self.pipeline = inputs[0].pipeline
383+
# self.merge_artifacts(inputs, exclude=["chunks"])
384+
# self.next(self.make_mmp_index)
385+
#
386+
# @kubernetes(**{**K8S, **WORKSTATION})
387+
# @environment(**ENV)
388+
# @retry
389+
# @step
390+
# def make_mmp_index(self):
391+
# self.pipeline.make_mmp_index()
392+
# self.next(self.assign_apo_pred_systems)
393+
#
394+
@kubernetes(**{**K8S, **DATABASES})
347395
@environment(**ENV)
348396
@retry
349397
@step
350-
def make_mmp_index(self):
351-
self.pipeline.make_mmp_index()
398+
def assign_apo_pred_systems(self):
399+
self.pipeline.cfg.flow.assign_apo_pred_systems_cpus = DATABASES["cpu"]
400+
self.pipeline.assign_apo_pred_systems()
352401
self.next(self.end)
353402

354403
@kubernetes(**K8S)

flows/docker.py

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ def get_version_bump(base_tag: str | None = None) -> str:
7373
),
7474
text=True,
7575
).strip()
76-
if log:
76+
if log and "origin/main" in log:
7777
bump = token.split()[1]
7878
break
7979
if bump == "skip":
@@ -217,13 +217,16 @@ def build_image(tag: str | None = None, push: bool = False) -> str:
217217
env = get_env(tag)
218218
image = f"{env['IMAGE_REPO']}/plinder"
219219
build_tag = env["BUILD_TAG"]
220+
registry = environ.get("PLINDER_REGISTRY", "/".join(get_image().split("/")[:-1]))
220221
cmd = [
221222
docker,
222223
"build",
223224
"-f",
224225
"dockerfiles/main/Dockerfile",
225226
"-t",
226227
f"{env['IMAGE_REPO']}/plinder:{build_tag}",
228+
"-t",
229+
f"{registry}/plinder:{build_tag}",
227230
"--secret",
228231
"id=INDEX_URL",
229232
"--build-arg",
@@ -236,7 +239,7 @@ def build_image(tag: str | None = None, push: bool = False) -> str:
236239
]
237240
Proc(cmd, env=env).execute()
238241
if push:
239-
cmd = [docker, "compose", "push", "plinder", "--quiet"]
242+
cmd = [docker, "push", f"{registry}/plinder:{build_tag}"]
240243
Proc(cmd, env=env).execute()
241244
return f"{image}:{build_tag}"
242245

@@ -245,6 +248,7 @@ def test_image(
245248
tag: str,
246249
push: bool = False,
247250
args: Optional[List[str]] = None,
251+
dirty: bool = False,
248252
) -> None:
249253
"""
250254
Run the test service from docker compose. Optionally
@@ -259,10 +263,15 @@ def test_image(
259263
if True, push images to the artifact registry
260264
args : List[str], default=None
261265
the arguments to pass to the image
266+
dirty : bool, default=False
267+
if True, mount the current working tree
262268
"""
263269
env = get_env(tag)
264270
docker = get_docker()
265-
cmd = [docker, "compose", "run", "test"]
271+
cmd = [docker, "compose", "run", "-e", "PLINDER_OFFLINE=true"]
272+
if dirty:
273+
cmd.extend(["-v", f"{Path.cwd()}/src/plinder:/opt/conda/lib/python3.9/site-packages/plinder"])
274+
cmd.append("test")
266275
if args is not None and len(args):
267276
cmd.extend(
268277
split(f'''/bin/bash -c "python -m pytest -v -n auto {' '.join(args)} && cp .coverage reports/.coverage"''')
@@ -277,7 +286,9 @@ def run_image(
277286
args: Optional[List[str]] = None,
278287
build: bool = False,
279288
tag: str | None = None,
289+
dirty: bool = False,
280290
it: bool = False,
291+
script: str | None = None,
281292
) -> None:
282293
"""
283294
Run the image mounting the current working tree. This can be
@@ -317,10 +328,12 @@ def run_image(
317328
"-v",
318329
f"{home}/.local/share/plinder:/plinder",
319330
"-v",
320-
f"{host}:{guest}",
331+
f"{host}/plinder:{guest}",
321332
"-v",
322333
f"{host}:{app}",
323334
]
335+
if script:
336+
cmd.extend(["-v", f"{host.parent}/{script}:{Path(app).parent}/{script}"])
324337
if it:
325338
import pty
326339

@@ -355,8 +368,10 @@ def main(argv: Optional[List[str]] = None):
355368
)
356369
build = subs.add_parser("build", help="Build the app image")
357370
test = subs.add_parser("test", help="Test the app image")
371+
test.add_argument("--dirty", default=False, action="store_true", help="Mount current working tree")
358372
run = subs.add_parser("run", help="Run the app image")
359373
run.add_argument("--it", default=False, action="store_true", help="Run in interactive mode")
374+
run.add_argument("--script", default="", help="Script to run")
360375
for sub in [build, test, run]:
361376
sub.add_argument(
362377
"--tag", default=None, help="The image tag to pass to build_image",
@@ -388,6 +403,9 @@ def main(argv: Optional[List[str]] = None):
388403
"run": run_image,
389404
}
390405
kwargs = {} if command == "bump" else nsargs
406+
if command is None:
407+
parser.print_help()
408+
exit()
391409
func[command](**kwargs)
392410

393411

0 commit comments

Comments (0)