Skip to content

Commit d916015

Browse files
committed
Add monodepth2 model
1 parent 0f12e8f commit d916015

18 files changed

+396
-93
lines changed

README.md

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -42,23 +42,23 @@ The setup and interface of the models is explained in [rmvd/models/README.md](rm
4242
### Evaluation script
4343
Evaluation is done with the script `eval.py`, for example on ETH3D:
4444
```bash
45-
python eval.py --model robust_mvd --dataset eth3d --eval_type mvd --input poses intrinsics --output /tmp/eval_output --input_size 768 1152
45+
python eval.py --model robust_mvd --dataset eth3d --eval_type mvd --inputs poses intrinsics --output /tmp/eval_output --input_size 768 1152
4646
```
4747
On KITTI:
4848
```bash
49-
python eval.py --model robust_mvd --dataset kitti --eval_type mvd --input poses intrinsics --output /tmp/eval_output --input_size 384 1280
49+
python eval.py --model robust_mvd --dataset kitti --eval_type mvd --inputs poses intrinsics --output /tmp/eval_output --input_size 384 1280
5050
```
5151
On DTU:
5252
```bash
53-
python eval.py --model robust_mvd --dataset dtu --eval_type mvd --input poses intrinsics --output /tmp/eval_output --input_size 896 1216
53+
python eval.py --model robust_mvd --dataset dtu --eval_type mvd --inputs poses intrinsics --output /tmp/eval_output --input_size 896 1216
5454
```
5555
On ScanNet:
5656
```bash
57-
python eval.py --model robust_mvd --dataset scannet --eval_type mvd --input poses intrinsics --output /tmp/eval_output --input_size 448 640
57+
python eval.py --model robust_mvd --dataset scannet --eval_type mvd --inputs poses intrinsics --output /tmp/eval_output --input_size 448 640
5858
```
5959
On Tanks and Temples:
6060
```bash
61-
python eval.py --model robust_mvd --dataset tanks_and_temples --eval_type mvd --input poses intrinsics --output /tmp/eval_output --input_size 704 1280
61+
python eval.py --model robust_mvd --dataset tanks_and_temples --eval_type mvd --inputs poses intrinsics --output /tmp/eval_output --input_size 704 1280
6262
```
6363

6464
The parameters `model`, `dataset` and `eval_type` are required.
@@ -133,7 +133,7 @@ The following describes how to evaluate on the benchmark.
133133
### Evaluation of models within the `rmvd` framework
134134
Evaluation on the benchmark is done with the script `eval.py`:
135135
```bash
136-
python eval.py --model robust_mvd --eval_type robustmvd --input poses intrinsics --output /tmp/eval_benchmark --eth3d_size 768 1152 --kitti_size 384 1280 --dtu_size 896 1216 --scannet_size 448 640 --tanks_and_temples_size 704 1280
136+
python eval.py --model robust_mvd --eval_type robustmvd --inputs poses intrinsics --output /tmp/eval_benchmark --eth3d_size 768 1152 --kitti_size 384 1280 --dtu_size 896 1216 --scannet_size 448 640 --tanks_and_temples_size 704 1280
137137
```
138138

139139
### Programmatic evaluation
@@ -171,9 +171,9 @@ format and to call the model. For details about these functions, see [rmvd/model
171171

172172
## Citation
173173
This is the official repository for the publication:
174-
> **A Benchmark and a Baseline for Robust Multi-view Depth Estimation**
174+
> **[A Benchmark and a Baseline for Robust Multi-view Depth Estimation](http://arxiv.org/abs/2209.06681)**
175175
>
176-
> [Philipp Schröppel](https://lmb.informatik.uni-freiburg.de/people/schroepp), [Jan Bechtold](https://lmb.informatik.uni-freiburg.de/people/bechtolj), [Artemij Amiranashvili](https://lmb.informatik.uni-freiburg.de/people/amiranas) and [Thomas Brox](https://lmb.informatik.uni-freiburg.de/people/brox)
176+
> [Philipp Schröppel](https://lmb.informatik.uni-freiburg.de/people/schroepp), [Jan Bechtold](https://lmb.informatik.uni-freiburg.de/people/bechtolj), [Artemij Amiranashvili](https://lmb.informatik.uni-freiburg.de/people/amiranas), [Thomas Brox](https://lmb.informatik.uni-freiburg.de/people/brox)
177177
>
178178
> **3DV 2022**
179179

eval.py

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
#!/usr/bin/env python3
2+
13
import argparse
24
import sys
35
import os.path as osp
@@ -36,6 +38,10 @@ def eval(args):
3638
eval = create_evaluation(evaluation_type=args.eval_type,
3739
out_dir=args.output,
3840
inputs=args.inputs,
41+
alignment=args.alignment,
42+
view_ordering=args.view_ordering,
43+
min_source_views=args.min_source_views,
44+
max_source_views=args.max_source_views,
3945
eval_uncertainty=args.eval_uncertainty)
4046

4147
with open(osp.join(args.output, "cmd.txt"), 'a') as f:
@@ -55,19 +61,29 @@ def eval(args):
5561
parser.add_argument('--weights', help="Path to weights of the model. Optional. If None, default weights are used.")
5662
parser.add_argument('--num_gpus', type=int, help="Number of GPUs. 0 means use CPU. Default: use 1 GPU.", default=1)
5763
parser.add_argument('--eval_type', help=f"Evaluation setting. Options are: {', '.join(list_evaluations())}")
58-
parser.add_argument('--input', nargs='*',
64+
parser.add_argument('--inputs', nargs='*',
5965
help=f"Model inputs. Images are always provided to the model. "
6066
f"It is possible to specify multiple additional inputs, "
61-
f"e.g. --input intrinsics --input poses. "
67+
f"e.g. --inputs intrinsics poses. "
6268
f"Options for additional model inputs are: intrinsics, poses, depth_range.",
63-
type=str, dest='inputs')
69+
type=str)
6470
parser.add_argument('--output', help="Path to folder for output data.")
6571

6672
parser.add_argument('--num_samples', type=int, help='Number of samples to be evaluated. Default: evaluate all.')
6773
parser.add_argument('--samples', type=int, nargs='*',
6874
help='Index of sample that should be evaluated. Ignored if num_samples is used. '
6975
'Default: evaluate all.')
7076

77+
parser.add_argument('--max_source_views', type=int, help='Maximum number of source views to use for evaluation. '
78+
'Default: use all available source views.')
79+
parser.add_argument('--min_source_views', type=int, default=1,
80+
help='Minimum number of source views to use for evaluation. Default: 1.')
81+
parser.add_argument('--view_ordering', default="quasi-optimal",
82+
help=f"Source view ordering. Options are: quasi-optimal (default), nearest.")
83+
parser.add_argument('--alignment',
84+
help=f"Alignment between predicted and ground truth depths. "
85+
f"Options are None, median, translation. Default: None")
86+
7187
parser.add_argument('--num_qualitatives', type=int, default=10,
7288
help='Number of qualitatives to be output. Negative values output all qualitatives. '
7389
'Ignored if --qualitative is used. Default: 10.')

eval_all.sh

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
#!/bin/bash
2+
3+
python eval.py --model robust_mvd --eval_type robustmvd --inputs poses intrinsics --output /tmp/eval_benchmark --eth3d_size 768 1152 --kitti_size 384 1280 --dtu_size 896 1216 --scannet_size 448 640 --tanks_and_temples_size 704 1280
4+
python eval.py --model robust_mvd_5M --eval_type robustmvd --inputs poses intrinsics --output /tmp/eval_benchmark --eth3d_size 768 1152 --kitti_size 384 1280 --dtu_size 896 1216 --scannet_size 448 640 --tanks_and_temples_size 704 1280
5+
python eval.py --model monodepth2_mono_stereo_1024x320_wrapped --eval_type robustmvd --output /tmp/eval_benchmark --max_source_views 0 --alignment median
6+
python eval.py --model monodepth2_mono_stereo_640x192_wrapped --eval_type robustmvd --output /tmp/eval_benchmark --max_source_views 0 --alignment median

inference.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
#!/usr/bin/env python3
2+
13
import argparse
24
import os
35
import os.path as osp

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
torch>=1.9.0
2+
torchvision
23
numpy
34
pillow
45
matplotlib

rmvd/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,6 @@
33
from .data import list_datasets, list_base_datasets, list_dataset_types, list_splits, has_dataset, create_dataset, \
44
create_compound_dataset
55

6-
from .models import list_models, has_model, create_model
6+
from .models import list_models, has_model, create_model, prepare_custom_model
77
from .eval import list_evaluations, create_evaluation
88
from .train import list_trainings, create_training

rmvd/data/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ target directory to download the dataset:
2525
```bash
2626
./scripts/download_eth3d.sh /path/to/eth3d
2727
```
28-
Then specify the download directory (`/path/to/eth3d`) in the `paths.toml` file.
28+
Then specify the download directory `/path/to/eth3d` in the `paths.toml` file.
2929

3030
### KITTI
3131
Download the KITTI raw data from <https://www.cvlibs.net/datasets/kitti/raw_data.php> using

rmvd/data/transforms.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,9 @@ def __call__(self, sample):
3232
sample["images"] = images
3333

3434
# resize intrinsics:
35-
scale_arr = np.array([[wd / orig_wd]*3, [ht / orig_ht]*3, [1.]*3], dtype=np.float32) # 3, 3
36-
sample["intrinsics"] = [intrinsic * scale_arr for intrinsic in sample["intrinsics"]]
35+
if "intrinsics" in sample:
36+
scale_arr = np.array([[wd / orig_wd]*3, [ht / orig_ht]*3, [1.]*3], dtype=np.float32) # 3, 3
37+
sample["intrinsics"] = [intrinsic * scale_arr for intrinsic in sample["intrinsics"]]
3738

3839
sample["orig_width"] = orig_wd
3940
sample["orig_height"] = orig_ht

rmvd/eval/multi_view_depth_evaluation.py

Lines changed: 28 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,6 @@
1414
from .metrics import m_rel_ae, pointwise_rel_ae, thresh_inliers, sparsification
1515

1616

17-
# TODO: add tensorboard logging
18-
19-
2017
class MultiViewDepthEvaluation:
2118
"""Multi-view depth evaluation.
2219
@@ -30,6 +27,9 @@ class MultiViewDepthEvaluation:
3027
A typical depth-from-video model would set
3128
inputs=["images", "intrinsics"], alignment="median".
3229
30+
A typical depth-from-single-view model would set
31+
inputs=["images"], max_source_views=0, alignment="median".
32+
3333
Args:
3434
out_dir: Directory where results will be written. If None, results are not written to disk.
3535
inputs: List of input modalities that are supplied to the algorithm.
@@ -39,9 +39,12 @@ class MultiViewDepthEvaluation:
3939
None evaluates predictions without any alignment.
4040
"median" scales predicted depth maps with the ratio of medians of predicted and ground truth depth maps.
4141
"translation" scales predicted depth maps with the ratio of the predicted and ground truth translation.
42+
max_source_views: Maximum number of source views to be considered. None means all available source views are
43+
considered. Default: None.
44+
min_source_views: Minimum number of source views provided to the model.
45+
If max_source_views is not None, is set to min(min_source_views, max_source_views). Default: 1.
4246
view_ordering: Ordering of source views during the evaluation.
43-
Options are "quasi-optimal", "nearest" and None. Default: "quasi-optimal".
44-
None: supply all source views to the model and evaluate predicted depth map.
47+
Options are "quasi-optimal" and "nearest". Default: "quasi-optimal".
4548
"quasi-optimal": evaluate predicted depth maps for all (keyview, sourceview) pairs.
4649
Order source views according to the prediction accuracy. Increase source view set based on
4750
the obtained ordering and re-evaluate for each additional source view.
@@ -50,8 +53,6 @@ class MultiViewDepthEvaluation:
5053
view set based on the ordering of views in the sample, i.e. based on the distance between source
5154
view indices and the keyview index. Log results based on the number of source views.
5255
Log best results as overall results.
53-
max_source_views: Maximum number of source views to be considered in case view_ordering is
54-
"quasi-optimal" or "nearest". None means all available source views are considered.
5556
eval_uncertainty: Evaluate predicted uncertainty (pred_depth_uncertainty) if available.
5657
Increases evaluation time.
5758
clip_pred_depth: Clip model predictions before evaluation to a reasonable range. This makes sense to reduce
@@ -64,12 +65,14 @@ def __init__(self,
6465
out_dir: Optional[str] = None,
6566
inputs: Sequence[str] = None,
6667
alignment: Optional[str] = None,
67-
view_ordering: str = "quasi-optimal",
6868
max_source_views: Optional[int] = None,
69+
min_source_views: int = 1,
70+
view_ordering: str = "quasi-optimal",
6971
eval_uncertainty: bool = True,
7072
clip_pred_depth: Union[bool, Tuple[float, float]] = True,
7173
sparse_pred: bool = False,
7274
verbose: bool = True,
75+
**_
7376
):
7477

7578
self.verbose = verbose
@@ -89,8 +92,9 @@ def __init__(self,
8992

9093
self.inputs = list(set(inputs + ["images"])) if inputs is not None else ["images"]
9194
self.alignment = alignment
92-
self.view_ordering = view_ordering
9395
self.max_source_views = max_source_views
96+
self.min_source_views = min_source_views if max_source_views is None else min(min_source_views, max_source_views)
97+
self.view_ordering = view_ordering if (self.max_source_views is None) or (self.max_source_views > 0) else None
9498
self.eval_uncertainty = eval_uncertainty
9599
self.clip_pred_depth = clip_pred_depth
96100
self.sparse_pred = sparse_pred
@@ -120,8 +124,9 @@ def __str__(self):
120124
ret = f"{self.name} with settings:"
121125
ret += f"\n\tInputs: {self.inputs}"
122126
ret += f"\n\tAlignment: {self.alignment}"
123-
ret += f"\n\tView ordering: {self.view_ordering}"
127+
ret += f"\n\tMin source views: {self.min_source_views}"
124128
ret += f"\n\tMax source views: {self.max_source_views}"
129+
ret += f"\n\tView ordering: {self.view_ordering}"
125130
ret += f"\n\tEvaluate uncertainty: {self.eval_uncertainty}"
126131
ret += f"\n\tClip predicted depth: {self.clip_pred_depth}"
127132
ret += f"\n\tPredicted depth is sparse: {self.sparse_pred}"
@@ -222,7 +227,7 @@ def _evaluate(self):
222227
ordered_source_indices = self._get_source_view_ordering(sample_inputs=sample_inputs, sample_gt=sample_gt)
223228
max_source_views = min(len(ordered_source_indices), self.max_source_views) \
224229
if self.max_source_views is not None else len(ordered_source_indices)
225-
min_source_views = 1 if self.view_ordering is not None else max_source_views
230+
min_source_views = self.min_source_views
226231

227232
best_metrics = None
228233
best_num_source_views = np.nan
@@ -234,7 +239,7 @@ def _evaluate(self):
234239
cur_keyview_idx = cur_view_indices.index(keyview_idx)
235240

236241
if self.verbose:
237-
print(f"\tEvaluating with {num_source_views} / {len(ordered_source_indices)} source views:")
242+
print(f"\tEvaluating with {num_source_views} / {max_source_views} source views:")
238243
print(f"\t\tSource view indices: {cur_source_indices}.")
239244

240245
self._reset_memory_stats()
@@ -371,7 +376,7 @@ def _init_results(self):
371376
def _get_source_view_ordering(self, sample_inputs, sample_gt):
372377
if self.view_ordering == 'quasi-optimal':
373378
return self._get_quasi_optimal_source_view_ordering(sample_inputs=sample_inputs, sample_gt=sample_gt)
374-
else:
379+
elif (self.view_ordering == 'nearest') or (self.view_ordering is None):
375380
return self._get_nearest_source_view_ordering(sample_inputs=sample_inputs, sample_gt=sample_gt)
376381

377382
def _get_nearest_source_view_ordering(self, sample_inputs, sample_gt):
@@ -389,13 +394,17 @@ def _get_quasi_optimal_source_view_ordering(self, sample_inputs, sample_gt):
389394
# construct temporary sample with a single source view:
390395
cur_sample_inputs = deepcopy(sample_inputs)
391396
cur_sample_gt = deepcopy(sample_gt)
392-
cur_sample_inputs['images'] = [cur_sample_inputs['images'][keyview_idx],
393-
cur_sample_inputs['images'][source_idx]]
394-
cur_sample_inputs['poses'] = [cur_sample_inputs['poses'][keyview_idx],
395-
cur_sample_inputs['poses'][source_idx]]
396-
cur_sample_inputs['intrinsics'] = [cur_sample_inputs['intrinsics'][keyview_idx],
397-
cur_sample_inputs['intrinsics'][source_idx]]
397+
if "images" in self.inputs:
398+
cur_sample_inputs['images'] = [cur_sample_inputs['images'][keyview_idx],
399+
cur_sample_inputs['images'][source_idx]]
400+
if "poses" in self.inputs:
401+
cur_sample_inputs['poses'] = [cur_sample_inputs['poses'][keyview_idx],
402+
cur_sample_inputs['poses'][source_idx]]
403+
if "intrinsics" in self.inputs:
404+
cur_sample_inputs['intrinsics'] = [cur_sample_inputs['intrinsics'][keyview_idx],
405+
cur_sample_inputs['intrinsics'][source_idx]]
398406
cur_sample_inputs['keyview_idx'] = np.array([0])
407+
# depth_range is not changed
399408

400409
# run model:
401410
pred, _, _ = self._run_model(cur_sample_inputs)

rmvd/eval/robust_mvd_benchmark.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,10 @@ class RobustMultiViewDepthBenchmark:
3232
None evaluates predictions without any alignment.
3333
"median" scales predicted depth maps with the ratio of medians of predicted and ground truth depth maps.
3434
"translation" scales predicted depth maps with the ratio of the predicted and ground truth translation.
35-
max_source_views: Maximum number of source views to be considered in case view_ordering is
36-
"quasi-optimal" or "nearest". None means all available source views are considered.
35+
max_source_views: Maximum number of source views to be considered. None means all available source views are
36+
considered. Default: None.
37+
min_source_views: Minimum number of source views provided to the model.
38+
If max_source_views is not None, is set to min(min_source_views, max_source_views). Default: 1.
3739
eval_uncertainty: Evaluate predicted uncertainty (pred_depth_uncertainty) if available.
3840
Increases evaluation time.
3941
sparse_pred: Predicted depth is sparse. Invalid predictions are indicated by 0 values and ignored in
@@ -45,9 +47,11 @@ def __init__(self,
4547
inputs: Sequence[str] = None,
4648
alignment: Optional[str] = None,
4749
max_source_views: Optional[int] = None,
50+
min_source_views: int = 1,
4851
eval_uncertainty: bool = True,
4952
sparse_pred: bool = False,
5053
verbose: bool = True,
54+
**_
5155
):
5256

5357
self.verbose = verbose
@@ -59,9 +63,10 @@ def __init__(self,
5963
if self.out_dir is not None:
6064
os.makedirs(self.out_dir, exist_ok=True)
6165

62-
self.inputs = inputs
66+
self.inputs = list(set(inputs + ["images"])) if inputs is not None else ["images"]
6367
self.alignment = alignment
6468
self.max_source_views = max_source_views
69+
self.min_source_views = min_source_views if max_source_views is None else min(min_source_views, max_source_views)
6570
self.eval_uncertainty = eval_uncertainty
6671
self.sparse_pred = sparse_pred
6772

@@ -79,6 +84,7 @@ def __str__(self):
7984
ret += f"\n\tInputs: {self.inputs}"
8085
ret += f"\n\tAlignment: {self.alignment}"
8186
ret += f"\n\tMax source views: {self.max_source_views}"
87+
ret += f"\n\tMin source views: {self.min_source_views}"
8288
ret += f"\n\tEvaluate uncertainty: {self.eval_uncertainty}"
8389
ret += f"\n\tPredicted depth is sparse: {self.sparse_pred}"
8490
if self.out_dir is not None:
@@ -145,6 +151,7 @@ def __call__(self,
145151

146152
eval = MultiViewDepthEvaluation(out_dir=out_dir, inputs=self.inputs, alignment=self.alignment,
147153
view_ordering="quasi-optimal", max_source_views=self.max_source_views,
154+
min_source_views=self.min_source_views,
148155
eval_uncertainty=self.eval_uncertainty, clip_pred_depth=True,
149156
sparse_pred=self.sparse_pred, verbose=self.verbose)
150157
# TODO: pass tqdm progress bar and set verbose to False

0 commit comments

Comments
 (0)