# parameters_blueprint.yml
# global options must be defined
global:
# Required: Path to a FASTA file
sequences_file: /path/to/sequences.fasta
# Required: Prefix for the output (used as the name of the run's output directory)
prefix: my_embeddings
## Optional: file manager
# file_manager: [*filesystem]
## Optional: remap the index via simple remapping instead of MD5 hashes (this is not encouraged)
# simple_remapping: [True, *False]
## Stages are executed in sequential order as they are outlined in this file.
## Stage names must be unique! Otherwise, stages will overwrite each other.
## The same stage type (e.g. embed, ...) can be executed multiple times.
## Dependencies for a stage are defined in the dependencies parameter
## This config file includes options for initializing classes and options specific to the protocol
## Options notation:
## *: denotes the default option
## @: denotes that the file or directory will be downloaded and stored locally if not provided
stage_0:
type: align
# Required: which protocol to use.
# Options: mmseqs_search, deepblast
protocol: mmseqs_search
# Required for mmseqs_search: sequences to search against (can be a FASTA file, an mmseqs DB or an mmseqs profile)
search_sequences_file: /path/to/a/sequence/db.fasta
# search_sequences_directory: /path/to/a/sequence/db/
# search_profiles_directory: /path/to/a/profiles/db/
# Optional for mmseqs_search: convert alignment to profile
# convert_to_profiles: [True,*False]
# Optional for mmseqs_search: alignment parameters (check bio_embeddings/align/mmseqs2.py#13 for available options)
# mmseqs_search_options:
# num_iterations: 5
# sensitivity: 7.5
# minimum_sequence_identity: 0.2
# maximum_number_of_prefilter_sequences: 100
# alignment_output: True
stage_1:
type: embed
# Required: which embedder to use
# Options: seqvec, prottrans_albert_bfd, prottrans_bert_bfd, prottrans_t5_bfd, prottrans_t5_uniref50,
# prottrans_t5_xl_u50, prottrans_xlnet_uniref100, cpcprot, esm, esm1b, esm1v, plus_rnn, unirep, bepler
protocol: prottrans_t5_xl_u50
# Optional: reduce embeddings to fixed size, per-protein. Comment out if not needed.
# Note that you can always compute the reduced embeddings from the full embeddings
# but some further stages (e.g. unsupervised extract) need this option
reduce: True
# Optional: discard per amino acid embeddings.
# Setting this parameter to True will disable storing full size embeddings (per amino acid).
# This parameter only works in combination with `reduce: True` or `embeddings_transformer_function`.
# discard_per_amino_acid_embeddings: [True, *False]
# Optional/Advanced: apply a transformation on the per-amino-acid embeddings
# === This is an advanced parameter ===
# This parameter will be "eval"-uated. It must be a callable. Most likely, you'll want to define lambda functions.
# The input of the function is a per-amino-acid embedding (i.e. an np array of shape n_layers*embedding_dimension*sequence_length).
# You can use numpy functions via "np".
# The result of the transformation will be stored in the transformed_embeddings_file.
# The pipeline *WILL NOT* check file size of the prospective transformed_embeddings_file.
# This parameter can be used in conjunction with `discard_per_amino_acid_embeddings`.
# Two examples:
# - "lambda x: x[0].mean(0)" --> for SeqVec, this will return the mean pooled embedding of the first layer
# - "lambda x: x.max(0)" --> for ProtTrans-BERT-BFD, this will return the max pooled embedding, instead of mean pooled
# embeddings_transformer_function:
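## Example (commented out; a sketch only, reusing the SeqVec lambda from above to store mean-pooled first-layer embeddings):
# embeddings_transformer_function: "lambda x: x[0].mean(0)"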
## Mandatory for esm1v: This defines which of the five models of the ensemble will be used
# ensemble_id: [1,2,3,4,5]
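## Example (commented out): use the first model of the ESM-1v ensemble
# ensemble_id: 1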
#### Optional parameters to instantiate classes
### Optional for protocol: seqvec
# weights_file: @/path/to/file
# options_file: @/path/to/file
## The following parameter sets an upper bound on total AA to include when embedding many sequences.
## Adjust this parameter if CUDA runs out of memory! The default (15,000) works for a 1080 with 8GB of GPU memory.
# max_amino_acids: [*15000]
## The following parameter sets the amount of AA to include in a batch before writing to disk.
### Optional for protocol: fasttext, word2vec, glove, esm
# model_file: @/path/to/file
### Optional for protocol: albert, bert, xlnet
# model_directory: @/path/to/directory
### Optional for protocol: seqvec, prottrans_albert_bfd, prottrans_bert_bfd, prottrans_xlnet_uniref100, esm
## Set the following parameter to calculate embeddings on a specific device or a specific GPU on multi-GPU hosts.
## The default, "cuda", runs on the default GPU. You can switch to CPU with "cpu" or select a GPU on
## multi-GPU systems with "cuda:0" or "cuda:1" etc.
## See https://pytorch.org/docs/stable/tensor_attributes.html?highlight=device#torch.torch.device
## for a complete list
## unirep will use the CPU by default; follow https://github.com/google/jax#pip-installation
## to run it on the GPU
# device: cuda
## Save numbers with lower precision, so that they take only about half the
## storage space (replace float32 with float16).
## This makes predictions based on the embeddings less exact.
# half_precision: [true, *false]
## For prottrans_t5_bfd, prottrans_t5_uniref50 and prottrans_t5_xl_u50 only:
## Use the model in half precision mode (float16).
## We recommend activating this with T5: on the tested GPU (Quadro RTX 3000) it reduces memory consumption
## from 12GB to 7GB, while the effect in benchmarks is negligible (±0.1 percentage points in different sets,
## generally below standard error).
# half_precision_model: [true, *false]
stage_2:
type: project
# Required: which projection algorithm to use
# Options: tsne, umap, pb_tucker
protocol: tsne
# Either depend on an embedding stage with reduced embeddings
depends_on: stage_1
# or define mapping and reduced embedding file:
# reduced_embeddings_file: path/to/reduced_embeddings_file.h5
# mapping_file: path/to/mapping_file.csv
### Optional for protocol: tsne
# n_iter: *15000
# perplexity: *6
# n_jobs: *-1
### Optional for protocol: umap
# min_dist: *0.6
# spread: *1
# n_neighbors: *15
### Optional for protocol: tsne and umap
# metric: *'cosine'
# n_components: *3
# random_state: *420
# verbose: *1
## Optional with pb_tucker: Postprocess reduced embeddings with tucker, which is a contrastive
## learning model trained to distinguish CATH superfamilies. It reduces the dimensionality from 1024 to 128.
# model_file: @/path/to/file
stage_3:
type: mutagenesis
## Required: Which language model to use
## Options: protbert_bfd_mutagenesis
protocol: protbert_bfd_mutagenesis
## Optional: temperature for softmax, see https://arxiv.org/abs/1503.02531
# temperature: [*1]
## Optional: Since we're running ProtBert, the common ProtBert options explained in `embed` are supported
# model_directory: @/path/to/directory
# device: cuda
# half_precision: [true, *false]
# half_precision_model: [true, *false]
stage_4:
type: visualize
## Required: which graph to render
## Options: plotly, plot_mutagenesis
protocol: plotly
## For plotly: Either depend on a project stage with projected embeddings file
depends_on: stage_2
## or define projected_reduced_embeddings_file:
# projected_reduced_embeddings_file: path/to/projected_reduced_embeddings_file.h5
## For plot_mutagenesis: Either depend on a mutagenesis stage
# depends_on: stage_3
## or define residue_probabilities_file:
# residue_probabilities_file: path/to/residue_probabilities_file.csv
## Optional for plotly: csv file with annotations
## csv must have header: identifier, label
# annotation_file: path/to/annotation_file.csv
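## Example annotation_file contents (identifiers and labels below are hypothetical, shown only to illustrate the format):
##   identifier,label
##   PROTEIN_A,membrane
##   PROTEIN_B,cytosol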
## Optional for plotly: set to False to hide proteins for which there is no annotation in the annotation file (only relevant if an annotation file is provided)
# display_unknown: [False, *True]
## Optional for plotly: set to True if identifiers in the annotation_file correspond to sequence MD5 hashes;
## if set to False (default), mapping will be performed on the original identifiers.
## Missing or duplicate identifiers will be ignored.
# merge_via_index: [True, *False]
## Optional for plotly: 2D vs 3D plot
# n_components: [2,*3]
stage_5:
type: extract
## Required: which method to use.
## Current options:
## - seqvec_from_publication (Uses models evaluated in https://doi.org/10.1186/s12859-019-3220-8 )
## - bert_from_publication (Uses models evaluated in https://doi.org/10.1101/2020.07.12.199554 )
## - t5_xl_u50_from_publication (Uses models evaluated in https://doi.org/10.1101/2020.07.12.199554 )
## - unsupervised (Uses concepts presented in https://github.com/Rostlab/goPredSim )
protocol: t5_xl_u50_from_publication
## The supervised extract (bert_from_publication and seqvec_from_publication) needs
## the full embeddings, i.e. `discard_per_amino_acid_embeddings` must be false in the
## embed stage, which is the default. For the unsupervised extract, you need the
## reduced embeddings, i.e. set `reduce: True`.
## Instead of depending on an embed stage, you can also provide a file manually: for unsupervised, supply
# reduced_embeddings_file:
## or, for bert_from_publication and seqvec_from_publication,
# embeddings_file:
depends_on: stage_2
## Optional for protocol: seqvec_from_publication, bert_from_publication,
## will be downloaded if not supplied
# secondary_structure_checkpoint_file: path/to/checkpoint.pt
# subcellular_location_checkpoint_file: path/to/checkpoint.pt
## Required for protocol: unsupervised
# reference_embeddings_file: path/to/embeddings.h5
# reference_annotations_file: path/to/annotation_file.csv
## Optional for protocol: unsupervised
## The following two options refer to the pairwise_distances function of scikit-learn 0.23.2
## https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise_distances.html#sklearn.metrics.pairwise_distances
# n_jobs: [*1]
# metric: [*euclidean]
## The following defines how many neighbours to consider when transferring annotations.
## k = 1 means transfer the annotations from the nearest neighbour;
## k > 1 will merge the annotations of all k neighbours onto the target embedding.
# k_nearest_neighbours: [*1]
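## Example (commented out; hypothetical value): transfer and merge annotations from the 3 nearest reference embeddings
# k_nearest_neighbours: 3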
## The following informs the pipeline whether you want to keep the pairwise distance matrix file.
## This is a CSV containing pairwise distances between query and reference embeddings using your metric.
## The file can become quite big (e.g. UniProt Human (query) vs. SwissProt (reference) results in 45GB)
## By default, this file will be discarded, but you can decide to keep it to perform other calculations.
# keep_pairwise_distances_matrix_file: [*False]