---
# Source: DART-ID/example/config_annotated.yaml (SlavovLab/DART-ID, master)
### DART-ID configuration
### =========================

# Input files: enumerate them below, or pass them on the command line
# when running the tool instead, for example:
#   dart_id -i /path/to/input1.txt /path/to/input2.txt
input:
  - '/path/to/dat/FP061A/evidence.txt'
  - '/path/to/dat/FP062ABCD/evidence.txt'
  - '/path/to/dat/FP063/evidence.txt'
  - '/path/to/dat/FP064AG/evidence.txt'


# Folder to output to. The updated evidence file, as well as the
# optional parameters files and figures will be deposited here
# this can also be specified on the command line. e.g.:
# -o /path/to/output/folder
#output: /path/to/output/folder

# When true, print diagnostic figures, as well as an HTML file that
# allows for quick browsing.
print_figures: true

## Input Type Options
## ==========================

# Column names of the input file, keyed by the field each one supplies.
# For now, every input file must share the same format.
# Update these names whenever the input format changes, e.g., when a
# different search engine or search engine configuration is used.
col_names:
  # --- Required columns (four): the program will not work without them ---

  # Peptide sequence: either the canonical amino-acid sequence or the
  # modified/annotated sequence, as provided by the search engine.
  sequence: 'Modified sequence'
  # Name of the raw/spectrum file, or a unique identifier for each
  # mass-spec run.
  raw_file: 'Raw file'
  # Retention/elution time of the ion, in minutes. Seconds may be used
  # instead, as long as the priors in the model are updated to reflect
  # that change.
  retention_time: 'Retention time'
  # Error probability of the peptide-spectrum match (PSM). Can be provided
  # by the search engine or by a separate program, e.g., Percolator.
  pep: 'PEP'

  # --- Optional columns: used for filtering or figure generation ---

  # Ion charge state. Used to (optionally) append the charge to the
  # peptide sequence, so that peptides with different charge states are
  # treated as different peptide species.
  charge: 'Charge'

  # Used to run the Fido protein inference algorithm.
  leading_protein: 'Leading razor protein'
  proteins: 'Proteins'

  # Base peak width, i.e., the time range between when an ion first
  # elutes and when it last elutes. Use this as a quality score in
  # order to filter out poorly retained ions.
  retention_length: 'Retention length'

  # --- Unused ---

  #intensity: Intensity
  #leading_gene: ~
  #genes: ~
  #exclude: ~
  #exp_id: ~
  #peptide_id: ~

## PSM Filters
## ======================

# Filters exclude certain observations (PSMs) from the alignment process.
# To disable a filter, remove it from this list or comment it out.
filters:
  # exclude_filename: filter out entire raw files, especially if they are
  # of a different run-time, or if the LC for that experiment was
  # problematic. "expr" is a regular expression that will be checked
  # against all raw files in the input.
  #- name: exclude_filename
  #  expr: PS06[1-3][AB]|PS064F

  # include_filename: same as above, but acting as a whitelist instead of
  # a blacklist.
  #- name: include_filename
  #  expr: 2018A

  # uniprot_exclusion: an exclusion list of UniProt IDs. Any PSM matching
  # this list will be filtered out. The IDs can be given either through
  # the "file" field (a file with UniProt IDs separated by line breaks)
  # or through the "list" field (UniProt IDs listed inline).
  #- name: uniprot_exclusion
  #  file: /path/to/list_of_uniprot_ids.txt
  #  list:
  #    - or_you_could
  #    - list_uniprot_ids_here
  #    - P36578
  #    - Q99797

  # contaminant: filter out contaminants as marked by the search engine.
  # "tag" is the pattern used to filter out PSMs.
  - name: 'contaminant'
    tag: 'CON__'

  # decoy: filter out decoys as marked by the search engine.
  # "tag" is the pattern used to filter out PSMs.
  #- name: decoy
  #  tag: REV__

  # retention_length: filter out PSMs by their retention length, which
  # some search engines define as the span from the time a spectrum is
  # first observed to the time it is last observed.
  #
  # With "dynamic" set to true, the threshold is a fraction of the
  # maximum RT for that raw file (i.e., the run-time); a value of 0.01
  # then means the threshold is 1% of the total run-time of the experiment.
  - name: 'retention_length'
    dynamic: true
    value: 0.01667

  # smears: filter out PSMs by their RT ranges in each experiment. This
  # behavior is similar to, but not exactly the same as, the
  # "retention_length" filter.
  #
  # With "dynamic" set to true, the threshold is a fraction of the
  # maximum RT for that raw file (i.e., the run-time); a value of 0.01
  # then means the threshold is 1% of the total run-time of the experiment.
  - name: 'smears'
    dynamic: true
    value: 0.03333


### =======================
### !! ADVANCED SETTINGS !!
### =======================

# Only edit the following settings if you understand their effects
# Detailed descriptions for each configuration field are given in the
# comments accompanying that field below.

# Level of verbosity. Higher numbers = printing more information
# 0 = ERROR
# 1 = WARNING (default)
# 2 = INFO
# 3 = DEBUG
# verbose: 1


## Input
## ==========================

# Column delimiter of the input files, e.g., ',' for CSV, '\t' for tabular
# sep: \t

# The input data is loaded in with pandas, and it doesn't like
# some columns being mostly empty. This needs to be set to false
# for input formats like MaxQuant.
# low_memory: false

# Instead of running a new STAN alignment, use a set of parameters
# from a previous run. The folder needs to include the three files
# outputted from a run with the "save_params" option on, and this run
# needs to be run with the exact same filters as that previous run.
# (exp_params.txt, peptide_params.txt, pair_params.txt)
# params_folder: /path/to/output_folder_from_prev_run

## Alignment Options
## ==========================

# Which alignment model to use
# Options: 'linear', 'two_piece_linear', 'two_piece_linear_laplace'
# model: 'two_piece_linear_laplace'

# add charge of ion onto the sequence, so that sequences ionized
# with different charge states will be aligned separately.
#
# Sometimes peptide sequences will form chemical adducts on column
# that can reflect on the charge received by the peptide during the
# ionization process, and aligning differently charged peptides can
# account for these chromatographic changes
# add_charge_to_sequence: false

# Number of iterations to run when generating priors
# If the average error when generating priors is too high,
# or prohibitive for STAN, then increase these to get more accurate priors
# prior_iters: 10

# Number of iterations to run for STAN. If STAN is consistently hitting
# its iteration limit without reaching an optimum it is happy with,
# then increase this number
# stan_iters: 20000

## Advanced Alignment Options

# Minimum value for mu, a canonical retention time (RT) for a peptide
# mu_min: 1

# Amount to distort RTs when calculating priors. If STAN is erroring out
# because the priors are already too close to the optimum, then consider
# slowly increasing this value to give STAN more room to iterate.
# rt_distortion: 0


# Advanced STAN parameters (with cmdstan: https://mc-stan.org/users/interfaces/cmdstan),
# for the LBFGS optimization algorithm
# we recommend leaving these at their defaults.

# Line search step size for first iteration
# init_alpha: 0.001

# Convergence tolerance on absolute changes in objective function value
# tol_obj: 1.e-12

# Convergence tolerance on relative changes in objective function value
# tol_rel_obj: 10000

# Convergence tolerance on the norm of the gradient
# tol_grad: 1.e-8

# Convergence tolerance on the relative norm of the gradient
# tol_rel_grad: 10000000

# Convergence tolerance on changes in parameter value
# tol_param: 1.e-8

# Amount of history to keep for L-BFGS
# history_size: 5

## Update Options
## ==========================

# DART-ID bootstraps the reference RT (mu), to account for uncertainty
# in the estimation and to penalize mu estimates derived from only a few data points (experiments)
# Ideally we would use the MCMC sampler (STAN) to sample the full posterior, but due to
# technical/performance constraints we are doing this in python instead

# options -- parametric_mixture, parametric, non-parametric, none
# bootstrap_method: 'parametric_mixture'
# bootstrap_iters: 100

# How to aggregate bootstrapped samples
# The weighted mean uses the PEP of each PSM as the weights
# options -- mean, median, weighted_mean
# mu_estimation: 'median'

## Protein Inference Options
## ==========================

# Run protein inference on the newly updated PSMs with the Fido framework
# https://noble.gs.washington.edu/proj/fido
# Paper in J. Proteome Research: http://dx.doi.org/10.1021/pr100594k

# Most, if not all, parameters described below are also described in detail
# on the Fido website and by the helper tips for the command-line version of Fido.

# To run protein inference, set this flag to true
# run_pi: true

# Parameters derived from a parameter search and optimizing over an objective
# that minimizes selecting false positives.
# Parameters listed below are the default for fido. Leave these, or specify them,
# to skip the parameter searching step.
# Comment out these three parameters to search for the best set of 3 parameters and
# then run protein inference with those.
# pi_gamma: 0.5
# pi_alpha: 0.1
# pi_beta:  0.01

# Log2 of maximum number of subgraph connected states. Graphs with more states
# than this threshold will be pruned. Increasing this number increases run-time,
# by a lot!
# pi_connected_protein_thresh: 14
# Clean up the peptide sequence string, by removing adjacent amino acids,
# modifications, and also switching isoleucine to leucine.
# pi_clean_peptide_name: false
# Default behavior is to cut all PSMs except for the highest scoring one,
# for each peptide, in order to simplify the graph. Set this to true to include
# all PSMs
# pi_use_all_psms: false
# Use protein group level inference
# pi_group_proteins: false
# Prune low-scoring PSMs from the graph before the main pruning procedure.
# The threshold in this case is 1e-2 (PEP > 0.99)
# pi_prune_low_scores: true
# Accuracy of the parameter selection. This will be ignored if pi_gamma, pi_alpha,
# and pi_beta are provided, as the selection will not be performed in the first place.
# 1 = best    / slower     (uses entire data file)
# 2 = relaxed / faster     (uses 300 observations)
# 3 = sloppy  / very fast  (uses 100 observations)
# pi_parameter_accuracy: 3

# Proteins in the "Proteins" column are assumed to be protein IDs in a string,
# separated by a delimiter, which is specified here:
# e.g., the delimiter is ';' if the "Proteins" string is:
#       "Protein1;Protein2;Protein3;Protein4"
# pi_protein_delimiter: ';'
# A substring that delineates decoy proteins. In the case of MaxQuant,
# all decoy proteins are prepended with the string "REV__"
# pi_decoy_tag: 'REV__'


## Output
## ==========================

# Save the parameters outputted by STAN into three text files.
# Use the "params_folder" option in a future run to use these
# parameters instead of running the alignment procedure again.
# save_params: true

# Default behavior is to only append two columns, the new PEP
# and the updated PEP. Set this to true to get many more columns
# added on.
# add_diagnostic_cols: false

# Overwrite the original PEP column with the updated PEP, and save
# the original PEP to the 'Spectra PEP' column.
# Useful for workflows that rely on the PEP column
# overwrite_pep: false

# Remove PSMs that have an FDR (q-value) above this value.
# 0.01 corresponds to selecting PSMs at an FDR of 1%
# psm_fdr_threshold: 0.01

# Remove PSMs that have an associated protein FDR (q-value) above this value.
# 0.01 corresponds to selecting proteins at an FDR of 1%
# protein_fdr_threshold: 0.01


# If providing multiple input files, combine them all into one
# tabular file and save it.
# save_combined_output: true
# The name of the combined output file.
# combined_output_name: ev_updated.txt

# If providing separate input files, then save the output files separately
# as well. This can be used in conjunction with 'save_combined_output'
# save_separate_output: false
# Save the separate output files into the same folder where they originally
# came from. **WARNING** this program does not check to see if it will overwrite
# an existing file. Please choose the options below carefully to avoid overwriting
# your original data!
# save_in_input_folder: false
# The suffix and extension of each of the separate output files.
# For example, if one of the inputs was "evidence.txt",
# the output would be "evidence_updated.txt"
# output_suffix: _updated
# output_ext: .txt

# Save logging messages to file?
# log_file: true

## Filters
## ==========================

# Upper threshold of PEP. PSMs with PEP higher than this value will not be
# considered for the alignment process
# These PSMs can still have their confidence updated, as long as there are
# PSMs of the same sequence that have PEP below this value
# pep_threshold: 0.5

# Peptide sequences need to be observed in at least this number of experiments,
# at a PEP below the pep_threshold, in order to participate in the alignment process
# num_experiments: 3

# Minimum number of confident PSMs per experiment, in order to participate in RT alignment
# If an experiment has fewer than this number of confident PSMs, then all of its
# PSMs will be excluded from the RT alignment process
# min_psms_per_experiment: 50