herrius committed
Commit 32b542e · 1 Parent(s): a9a1d7a

Upload 259 files

This view is limited to 50 files because the commit contains too many changes.

Files changed (50)
  1. .gitignore +145 -0
  2. LICENSE +224 -0
  3. README.md +201 -3
  4. configs/BERT_L12_H192_experiments/4tasks_training.yaml +729 -0
  5. configs/BERT_L12_H192_experiments/4tasks_training_small_datasets.yaml +292 -0
  6. configs/BERT_L12_H192_experiments/7tasks_berttiny_training.yaml +416 -0
  7. configs/BERT_L12_H192_experiments/7tasks_berttiny_training_apex_o2.yaml +9 -0
  8. configs/BERT_L12_H192_experiments/7tasks_berttiny_training_lamb.yaml +418 -0
  9. configs/BERT_L12_H192_experiments/7tasks_berttiny_training_moe.yaml +25 -0
  10. configs/BERT_L12_H192_experiments/7tasks_berttiny_training_moe_lsfp32_gate_softmax_layernorm_fp16.yaml +42 -0
  11. configs/BERT_L12_H192_experiments/7tasks_berttiny_training_moe_scale_before.yaml +444 -0
  12. configs/BERT_L12_H192_experiments/base_model_bert_l12_h192.yaml +73 -0
  13. configs/BERT_L12_H192_experiments/in1k_training.yaml +197 -0
  14. configs/BERT_L12_H192_experiments/in1k_training_moe.yaml +219 -0
  15. configs/BERT_L12_H192_experiments/moe_debug.yaml +536 -0
  16. configs/BERT_L12_H192_experiments/moe_debug_load_ds_checkpoint.yaml +541 -0
  17. configs/BERT_L12_H192_experiments/mscoco_caption_debug.yaml +234 -0
  18. configs/BERT_L12_H192_experiments/vqa_debug.yaml +189 -0
  19. configs/BERT_L12_H384_experiments/base_model_bert_l12_h384.yaml +80 -0
  20. configs/BERT_L12_H384_experiments/in1k_training.yaml +189 -0
  21. configs/BERT_L12_H768_experiments/16tasks_training.yaml +738 -0
  22. configs/BERT_L12_H768_experiments/16tasks_training_apex_o2.yaml +11 -0
  23. configs/BERT_L12_H768_experiments/16tasks_training_basedense_stage1_64gpu.yaml +739 -0
  24. configs/BERT_L12_H768_experiments/16tasks_training_basedense_stage2_64gpu.yaml +750 -0
  25. configs/BERT_L12_H768_experiments/16tasks_training_basemoe_stage1_56gpu.yaml +733 -0
  26. configs/BERT_L12_H768_experiments/16tasks_training_basemoe_stage2_56gpu.yaml +744 -0
  27. configs/BERT_L12_H768_experiments/16tasks_training_stage2_64gpu_v1.yaml +750 -0
  28. configs/BERT_L12_H768_experiments/base_model_bert_l12_h768.yaml +73 -0
  29. configs/BERT_L12_H768_experiments/bw_mlm_training.yaml +309 -0
  30. configs/BERT_L12_H768_experiments/finetuning/GLUE_finetuning_experiments/GLUE_CoLA_mlm_finetune.yaml +89 -0
  31. configs/BERT_L12_H768_experiments/finetuning/GLUE_finetuning_experiments/GLUE_MNLI_mlm_finetune.yaml +89 -0
  32. configs/BERT_L12_H768_experiments/finetuning/GLUE_finetuning_experiments/GLUE_MRPC_mlm_finetune.yaml +88 -0
  33. configs/BERT_L12_H768_experiments/finetuning/GLUE_finetuning_experiments/GLUE_QNLI_mlm_finetune.yaml +85 -0
  34. configs/BERT_L12_H768_experiments/finetuning/GLUE_finetuning_experiments/GLUE_QQP_mlm_finetune.yaml +84 -0
  35. configs/BERT_L12_H768_experiments/finetuning/GLUE_finetuning_experiments/GLUE_RTE_mlm_finetune.yaml +92 -0
  36. configs/BERT_L12_H768_experiments/finetuning/GLUE_finetuning_experiments/GLUE_SST2_mlm_finetune.yaml +89 -0
  37. configs/BERT_L12_H768_experiments/finetuning/GLUE_finetuning_experiments/base.yaml +22 -0
  38. configs/BERT_L12_H768_experiments/finetuning/flickr30k_caption_finetuning.yaml +151 -0
  39. configs/BERT_L12_H768_experiments/finetuning/flickr30k_retrieval_finetuning.yaml +132 -0
  40. configs/BERT_L12_H768_experiments/finetuning/in1k_training.yaml +135 -0
  41. configs/BERT_L12_H768_experiments/finetuning/in1k_training_384inputsize.yaml +134 -0
  42. configs/BERT_L12_H768_experiments/finetuning/k400_training.yaml +133 -0
  43. configs/BERT_L12_H768_experiments/finetuning/mscoco_caption_finetuning.yaml +150 -0
  44. configs/BERT_L12_H768_experiments/finetuning/mscoco_retrieval_finetuning.yaml +132 -0
  45. configs/BERT_L12_H768_experiments/finetuning/msvd_caption_finetuning.yaml +144 -0
  46. configs/BERT_L12_H768_experiments/finetuning/msvd_retrieval_finetuning.yaml +129 -0
  47. configs/BERT_L12_H768_experiments/finetuning/msvd_retrieval_finetuning_frames8.yaml +125 -0
  48. configs/BERT_L12_H768_experiments/finetuning/vqa_finetuning_debug.yaml +127 -0
  49. configs/BERT_L12_H768_experiments/in1k_training.yaml +310 -0
  50. configs/BERT_L12_H768_experiments/moe_finetuning/GLUE_finetuning_experiments/GLUE_CoLA_mlm_finetune.yaml +89 -0
.gitignore ADDED
@@ -0,0 +1,145 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ pip-wheel-metadata/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ .vscode
+ .vscode/*
+ .DS_Store
+
+ output/*
+ work_dirs
+ work_dirs/*
+ work_dirs/
+
+ data/temp/*
+ slurm_tools/
+ slurm_run.sh
+
+ core*
+ dist_url_*
LICENSE ADDED
@@ -0,0 +1,224 @@
+ Copyright (c) 2022 - present, SenseTime. All Rights Reserved.
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright 2022 - present, SenseTime
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+ X-modaler
+
+ Copyright 2021 Jingdong Technology Information Technology Co., Ltd
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
README.md CHANGED
@@ -1,3 +1,201 @@
- ---
- license: apache-2.0
- ---
+ # Uni-Perceiver
+
+ This repository contains training (pre-training, fine-tuning, prompt-tuning) code, evaluation code, and pretrained models for the following papers:
+
+ > [Uni-Perceiver](https://arxiv.org/abs/2112.01522): Pre-training Unified Architecture for Generic Perception for Zero-shot and Few-shot Tasks, CVPR 2022.
+
+ > [Uni-Perceiver-MoE](https://arxiv.org/abs/2206.04674): Learning Sparse Generalist Models with Conditional MoEs, NeurIPS 2022.
+
+ ## Introduction
+
+ __Uni-Perceiver__ is a generalist model (generic perception model) that can process a variety of modalities and tasks with unified modeling and shared parameters. Different perception tasks are modeled with the same formulation: finding the maximum-likelihood target for each input through the similarity of their representations. Uni-Perceiver is pre-trained on several uni-modal and multi-modal tasks, and evaluated on a variety of downstream tasks, including novel tasks that did not appear in the pre-training stage. Thanks to the unified formulation, it shows the ability of zero-shot inference on novel tasks, and achieves performance close to or on par with state-of-the-art results via prompt tuning or fine-tuning.
+
+ ![UnPerceiver-intro](./figs/overview.png)
+
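+ To make the formulation concrete, here is a minimal, illustrative sketch (not code from this repository; the function and argument names are ours): the shared encoder embeds the input and every candidate target, and the prediction is the target whose representation is most similar to the input's, scaled by a learnable per-task temperature (the `logit_scale_*` entries in the configs).
+
+ ```python
+ import torch
+ import torch.nn.functional as F
+
+ def unified_predict(input_repr: torch.Tensor,    # (D,)   encoded input (image/text/video)
+                     target_reprs: torch.Tensor,  # (K, D) encoded candidate targets
+                     logit_scale: torch.Tensor):  # ()     learnable temperature
+     """Score candidate targets for one input by representation similarity."""
+     x = F.normalize(input_repr, dim=-1)
+     t = F.normalize(target_reprs, dim=-1)
+     logits = logit_scale.exp() * (t @ x)         # (K,) similarity logits
+     return logits.softmax(dim=-1)                # likelihood over candidate targets
+ ```
+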
+ In __Uni-Perceiver-MoE__, we found that interference among different tasks and modalities can degrade the performance of generalist models on some tasks compared with task-specialized models. We introduce Conditional Mixture-of-Experts (Conditional MoEs) to mitigate such interference. By incorporating the proposed Conditional MoEs, Uni-Perceiver-MoE effectively mitigates the interference across tasks and modalities, and achieves state-of-the-art results on a series of downstream tasks via prompt tuning on 1% of downstream data. Moreover, the introduction of Conditional MoEs preserves the generalization ability of generalist models to conduct zero-shot inference on new tasks.
+
+ ![UnPerceiver-moe-intro](./figs/overview_moe.png)
+
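+ As a rough, simplified sketch of the idea (a dense mixture for clarity; the paper uses sparse conditional routing, and the class below is illustrative rather than this repository's implementation), a conditional gate routes tokens based on an attribute such as the task or modality id instead of the token embedding alone:
+
+ ```python
+ import torch
+ import torch.nn as nn
+
+ class ConditionalMoE(nn.Module):
+     """Toy conditional MoE: the gate sees a condition (task/modality) embedding."""
+     def __init__(self, dim: int, num_experts: int, num_conditions: int):
+         super().__init__()
+         self.experts = nn.ModuleList(nn.Linear(dim, dim) for _ in range(num_experts))
+         self.cond_embed = nn.Embedding(num_conditions, dim)
+         self.gate = nn.Linear(dim, num_experts)
+
+     def forward(self, x: torch.Tensor, cond_id: torch.Tensor) -> torch.Tensor:
+         # x: (B, N, D) tokens; cond_id: (B,) task/modality index per sample
+         weights = self.gate(self.cond_embed(cond_id)).softmax(dim=-1)  # (B, E)
+         out = torch.stack([e(x) for e in self.experts], dim=-1)        # (B, N, D, E)
+         return (out * weights[:, None, None, :]).sum(dim=-1)           # (B, N, D)
+ ```
+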
+ ## Main Results and Pretrained Models
+
+ ### Base Models
+
+ <table border="1" width="100%">
+ <tr align="center">
+ <th>Task</th>
+ <th>Image Classification</th>
+ <th colspan="2">Image Caption</th>
+ <th colspan="4">Image Retrieval</th>
+ <th>Video Classification</th><th>Video Caption</th><th colspan="2">Video Retrieval</th>
+ </tr>
+ <tr align="center">
+ <td>Dataset</td><td>ImageNet-1k</td><td>MSCOCO</td><td>Flickr30k</td><td colspan="2">MSCOCO</td><td colspan="2">Flickr30k</td><td>Kinetics-400</td><td>MSVD</td><td colspan="2">MSVD</td>
+ </tr>
+ <tr align="center">
+ <td>Split</td><td>ILSVRC 2012 val</td><td>Karpathy test</td><td>test</td><td colspan="2">Karpathy test</td><td colspan="2">test</td><td>val</td><td>val</td><td colspan="2">val</td>
+ </tr>
+ <tr align="center">
+ <td>Metric</td><td>Acc@1</td><td>BLEU-4</td><td>BLEU-4</td><td>R@1 i2t</td><td>R@1 t2i</td><td>R@1 i2t</td><td>R@1 t2i</td><td>Acc@1</td><td>BLEU-4</td><td>R@1 v2t</td><td>R@1 t2v</td>
+ </tr>
+ <tr align="center">
+ <td>Uni-Perceiver<sub>BASE</sub> w/o Tuning</td><td>79.2</td><td>32.0</td><td>14.7</td><td>64.9</td><td>50.7</td><td>82.3</td><td>71.1</td><td>74.5</td><td>22.6</td><td>50.3</td><td>38.7</td>
+ </tr>
+ <tr align="center">
+ <td>Uni-Perceiver<sub>BASE</sub> PT (1%)</td><td>80.9</td><td>35.5</td><td>30.2</td><td>68.4</td><td>51.9</td><td>91.0</td><td>76.0</td><td>74.8</td><td>59.5</td><td>62.7</td><td>43.8</td>
+ </tr>
+ <tr align="center">
+ <td>Uni-Perceiver<sub>BASE</sub> FT (100%)</td><td>84.0</td><td>36.4</td><td>31.2</td><td>69.8</td><td>53.9</td><td>92.7</td><td>77.5</td><td>77.7</td><td>63.3</td><td>62.8</td><td>45.8</td>
+ </tr>
+ <tr align="center">
+ <td>Uni-Perceiver-MoE<sub>BASE</sub> w/o Tuning</td><td>80.3</td><td>33.2</td><td>15.9</td><td>64.6</td><td>51.6</td><td>82.1</td><td>75.8</td><td>76.8</td><td>23.4</td><td>52.8</td><td>40.0</td>
+ </tr>
+ <tr align="center">
+ <td>Uni-Perceiver-MoE<sub>BASE</sub> PT (1%)</td><td>82.0</td><td>36.8</td><td>30.7</td><td>68.9</td><td>52.6</td><td>91.3</td><td>78.5</td><td>77.2</td><td>60.0</td><td>65.6</td><td>45.3</td>
+ </tr>
+ <tr align="center">
+ <td>Uni-Perceiver-MoE<sub>BASE</sub> FT (100%)</td><td>84.5</td><td>37.3</td><td>32.4</td><td>70.5</td><td>54.1</td><td>93.6</td><td>79.8</td><td>79.3</td><td>65.4</td><td>65.0</td><td>47.8</td>
+ </tr>
+ </table>
+
+ ### Large Models
+
+ <table border="1" width="100%">
+ <tr align="center">
+ <th>Task</th>
+ <th>Image Classification</th>
+ <th colspan="2">Image Caption</th>
+ <th colspan="4">Image Retrieval</th>
+ <th>Video Classification</th><th>Video Caption</th><th colspan="2">Video Retrieval</th>
+ </tr>
+ <tr align="center">
+ <td>Dataset</td><td>ImageNet-1k</td><td>MSCOCO</td><td>Flickr30k</td><td colspan="2">MSCOCO</td><td colspan="2">Flickr30k</td><td>Kinetics-400</td><td>MSVD</td><td colspan="2">MSVD</td>
+ </tr>
+ <tr align="center">
+ <td>Split</td><td>ILSVRC 2012 val</td><td>Karpathy test</td><td>test</td><td colspan="2">Karpathy test</td><td colspan="2">test</td><td>val</td><td>val</td><td colspan="2">val</td>
+ </tr>
+ <tr align="center">
+ <td>Metric</td><td>Acc@1</td><td>BLEU-4</td><td>BLEU-4</td><td>R@1 i2t</td><td>R@1 t2i</td><td>R@1 i2t</td><td>R@1 t2i</td><td>Acc@1</td><td>BLEU-4</td><td>R@1 v2t</td><td>R@1 t2v</td>
+ </tr>
+ <tr align="center">
+ <td>Uni-Perceiver<sub>LARGE</sub> w/o Tuning</td><td>82.7</td><td>35.3</td><td>15.1</td><td>67.8</td><td>54.1</td><td>83.7</td><td>74.2</td><td>79.5</td><td>24.7</td><td>45.4</td><td>34.2</td>
+ </tr>
+ <tr align="center">
+ <td>Uni-Perceiver<sub>LARGE</sub> PT (1%)</td><td>84.2</td><td>38.6</td><td>32.9</td><td>73.3</td><td>56.2</td><td>92.1</td><td>80.0</td><td>80.0</td><td>67.2</td><td>65.5</td><td>48.6</td>
+ </tr>
+ <tr align="center">
+ <td>Uni-Perceiver<sub>LARGE</sub> FT (100%)</td><td>86.2</td><td>39.2</td><td>35.5</td><td>74.4</td><td>57.9</td><td>94.7</td><td>82.1</td><td>81.9</td><td>68.3</td><td>65.2</td><td>50.8</td>
+ </tr>
+ <tr align="center">
+ <td>Uni-Perceiver-MoE<sub>LARGE</sub> w/o Tuning</td><td>83.4</td><td>35.5</td><td>15.8</td><td>67.9</td><td>55.3</td><td>83.6</td><td>75.9</td><td>82.1</td><td>24.6</td><td>45.7</td><td>41.9</td>
+ </tr>
+ <tr align="center">
+ <td>Uni-Perceiver-MoE<sub>LARGE</sub> PT (1%)</td><td>84.9</td><td>39.3</td><td>33.7</td><td>73.3</td><td>57.1</td><td>92.4</td><td>80.6</td><td>83.0</td><td>67.6</td><td>66.4</td><td>50.3</td>
+ </tr>
+ <tr align="center">
+ <td>Uni-Perceiver-MoE<sub>LARGE</sub> FT (100%)</td><td>86.4</td><td>40.5</td><td>36.2</td><td>74.7</td><td>58.3</td><td>94.1</td><td>83.7</td><td>84.2</td><td>68.9</td><td>67.6</td><td>52.3</td>
+ </tr>
+ </table>
+
+ * The numbers are slightly better than those reported in the original Uni-Perceiver paper; they come from the reproduced version of Uni-Perceiver used as the baseline of [Uni-Perceiver-MoE](https://arxiv.org/abs/2206.04674).
+ * The image resolution for all tasks is `224x224`.
+ * See [OtherResults.md](data/other_results.md) for results on more tasks and datasets.
+
+ ## Usage
+ ### Requirements
+ * Linux, CUDA>=10.1, GCC>=5.4
+
+ * Python >=3.7
+
+ * pytorch >= 1.8.0
+
+ * JAVA >= 1.8 (for caption task evaluation)
+
+ ### Installation
+ ```bash
+ git clone https://github.com/fundamentalvision/Uni-Perceiver
+ cd Uni-Perceiver
+ pip install -r requirements.txt
+ ```
+
+ ### Data
+ See [prepare_data.md](data/prepare_data.md).
+
+ ### Pre-trained Model Weights
+ See [checkpoints.md](data/checkpoints.md).
+
+ ### Pre-training
+ See [pretraining.md](data/pretraining.md).
+
+ ### Fine-tuning
+ See [finetuning.md](data/finetuning.md).
+
+ ### Prompt-tuning
+
+ See [prompt_tuning.md](data/prompt_tuning.md).
+
+ ### Inference
+
+ See [inference.md](data/inference.md).
+
+ ### TODO
+
+ * release more pretrained models
+   - [ ] Uni-Perceiver Tiny model
+   - [ ] Uni-Perceiver Small model
+   - [ ] Uni-Perceiver Huge model
+
+ * support more datasets and tasks
+
+ ## License
+ Uni-Perceiver is licensed under the [Apache-2.0 License](./LICENSE).
+
+ <br></br>
+
+ ## Citing Uni-Perceiver
+ If you find Uni-Perceiver useful in your research, please consider giving a star ⭐ and citing:
+ ```bibtex
+ @article{zhu2021uni,
+   title={Uni-Perceiver: Pre-training Unified Architecture for Generic Perception for Zero-shot and Few-shot Tasks},
+   author={Zhu, Xizhou and Zhu, Jinguo and Li, Hao and Wu, Xiaoshi and Wang, Xiaogang and Li, Hongsheng and Wang, Xiaohua and Dai, Jifeng},
+   journal={arXiv preprint arXiv:2112.01522},
+   year={2021}
+ }
+ ```
+
+ ```bibtex
+ @article{zhu2022uni,
+   title={Uni-Perceiver-MoE: Learning Sparse Generalist Models with Conditional MoEs},
+   author={Zhu, Jinguo and Zhu, Xizhou and Wang, Wenhai and Wang, Xiaohua and Li, Hongsheng and Wang, Xiaogang and Dai, Jifeng},
+   journal={arXiv preprint arXiv:2206.04674},
+   year={2022}
+ }
+ ```
+
+ ### Acknowledgements
+ Many thanks to the following projects, which helped us a lot in building this codebase:
+ * [Detectron2](https://github.com/facebookresearch/detectron2)
+ * [X-modaler](https://github.com/YehLi/xmodaler)
+ * [deit](https://github.com/facebookresearch/deit)
+ * [VL-BERT](https://github.com/jackroos/VL-BERT)
+ * [TimeSformer](https://github.com/facebookresearch/TimeSformer)
+ * [CLIP](https://github.com/openai/CLIP)
configs/BERT_L12_H192_experiments/4tasks_training.yaml ADDED
@@ -0,0 +1,729 @@
+ _BASE_: "base_model_bert_l12_h192.yaml"
+
+ SHARED_TARGETS:
+
+   -
+     NAME: 'ImageNet1k'
+     SHARED_TARGETS_CFG:
+       FILE_PATH: 'open_source_dataset/imagenet_class_name_CLIP_with_endoftext.pkl'
+       DISTRIBUTED: False
+
+   -
+     NAME: 'Vocab_Word'
+     SHARED_TARGETS_CFG:
+       FILE_PATH: 'open_source_dataset/vocabulary_CLIP_with_endoftext.pkl'
+       DISTRIBUTED: True
+
+ TASKS:
+
+   -
+     NAME: imagenet
+     DATASETS:
+       TRAIN: 'ImageNetDataset'
+       # VAL: 'ImageNetDataset'
+       TASK_TYPE: 'image_classification'
+       DATASET_NAME: 'ImageNet1k'
+       TARGET_SET: ['ImageNet1k']
+     DATALOADER:
+       TRAIN_BATCH_SIZE: 4
+       # TEST_BATCH_SIZE: 2
+       NUM_WORKERS: 4
+       FEATS_FOLDER: 'open_source_dataset/imagenet'
+       S3_PATH: 'cluster2:s3://imagenet'
+       ANNO_FOLDER: 'open_source_dataset/imagenet/meta'
+       SAMPLING_WEIGHT: 1.0
+       CLASS_NAME_FILE: 'open_source_dataset/imagenet_class_name.pkl'
+       MIXUP: 0.8
+       CUTMIX: 1.0
+       MIXUP_PROB: 1.0
+       MIXUP_SWITCH_PROB: 0.5
+       MIXUP_MODE: 'batch'
+       MIXUP_LABEL_SMOOTHING: 0.1
+     MODEL:
+       MAX_SEQ_LEN: -1
+       LABELS_NUM: 1000
+       TEMP_NAME: logit_scale_img_cls
+     LOSSES:
+       NAMES: ['SoftTargetCrossEntropy', 'Accuracy']
+       LOSS_WEIGHT: 1.0
+       REDUCTION: 'mean'
+       # LOSS_FP32: True
+     INFERENCE:
+       NAME: 'ImageNetEvaler'
+       ID_KEY: 'image_id'
+       VALUE: 'cls_logits'
+       VAL_ANNFILE: 'open_source_dataset/imagenet/meta/val.txt'
+       TEST_ANNFILE: ''
+       GENERATION_MODE: False
+
+   -
+     NAME: mscoco_caption
+     DATASETS:
+       TRAIN: 'ImageTextPairDataset'
+       # VAL: 'ImageTextPairDataset'
+       # TEST: 'ImageTextPairDataset'
+       TASK_TYPE: 'image_caption'
+       DATASET_NAME: 'MSCOCO'
+       TARGET_SET: ['Vocab_Word']
+     DATALOADER:
+       TRAIN_BATCH_SIZE: 64
+       TEST_BATCH_SIZE: 32
+       NUM_WORKERS: 4
+       FEATS_FOLDER: 'open_source_dataset/mscoco_dataset/coco_origin'
+       ANNO_FOLDER: 'open_source_dataset/mscoco_dataset/new_annotations'
+       S3_PATH: 's3://coco/'
+       SEQ_PER_SAMPLE: 1
+       CACHE_MODE: True
+       CIRCULAR_CACHE_MODE: False
+       ZIP_MODE: False
+       CACHE_ORIGIN_IMAGE: False
+       RANDOM_CAPTION: False
+       AS_NUMPY_AS_POSSIBLE: False
+       SAMPLING_WEIGHT: 1.0
+       TRANSFORM: 'clip_transforms'
+       RANDOM_MASK: True
+     MODEL:
+       MAX_SEQ_LEN: 50
+       EVAL_MAX_SEQ_LEN: 21
+       TEMP_NAME: logit_scale_caption
+     LOSSES:
+       NAMES: ['CrossEntropy', 'Accuracy']
+       LOSS_WEIGHT: 0.33333
+       REDUCTION: 'mean'
+     DECODE_STRATEGY:
+       NAME: 'CaptionBeamSearcherV3'
+       BEAM_SIZE: 2
+       # LEN_PENALTY: 1.0
+     INFERENCE:
+       NAME: 'COCOEvaler'
+       VOCAB: 'CLIP'
+       ID_KEY: 'image_id'
+       VALUE: 'caption'
+       VAL_ANNFILE: 'open_source_dataset/mscoco_dataset/new_annotations/captions_val5k.json'
+       TEST_ANNFILE: 'open_source_dataset/mscoco_dataset/new_annotations/captions_test5k.json'
+       GENERATION_MODE: True
+
+   -
+     NAME: yfcc_caption
+     DATASETS:
+       TRAIN: 'ImageTextPairDataset'
+       TASK_TYPE: 'image_caption'
+       DATASET_NAME: 'YFCC'
+       TARGET_SET: ['Vocab_Word']
+     DATALOADER:
+       TRAIN_BATCH_SIZE: 64
+       TEST_BATCH_SIZE: 32
+       NUM_WORKERS: 2
+       S3_ANNO_FOLDER: 'cluster2:s3://yfcc'
+       ANNO_FOLDER: 'open_source_dataset/yfcc'
+       ANNO_FILENAME: 'yfcc100m_subset_available_untokenized.json'
+       FEATS_FOLDER: 'open_source_dataset/yfcc/'
+       S3_PATH: 'cluster2:s3://yfcc/'
+       SEQ_PER_SAMPLE: 1
+       SAMPLER: NodeDistributed
+       CACHE_MODE: True
+       CIRCULAR_CACHE_MODE: False
+       ZIP_MODE: False
+       CACHE_ORIGIN_IMAGE: False
+       RANDOM_CAPTION: True
+       AS_NUMPY_AS_POSSIBLE: False
+       SAMPLING_WEIGHT: 1.0
+       TRANSFORM: 'clip_transforms'
+     MODEL:
+       MAX_SEQ_LEN: 50
+       TEMP_NAME: logit_scale_caption
+     LOSSES:
+       NAMES: ['CrossEntropy', 'Accuracy']
+       LOSS_WEIGHT: 1.0
+       REDUCTION: 'mean'
+     INFERENCE:
+       VOCAB: 'CLIP'
+       GENERATION_MODE: False
+
+   -
+     NAME: cc12m_caption
+     DATASETS:
+       TRAIN: 'ImageTextPairDataset'
+       TASK_TYPE: 'image_caption'
+       DATASET_NAME: 'CC12M'
+       TARGET_SET: ['Vocab_Word']
+     DATALOADER:
+       TRAIN_BATCH_SIZE: 64
+       TEST_BATCH_SIZE: 32
+       NUM_WORKERS: 2
+       S3_ANNO_FOLDER: 's3://cc12m/'
+       ANNO_FOLDER: 'open_source_dataset/c12m/'
+       ANNO_FILENAME: 'train_available.json'
+       FEATS_FOLDER: 'open_source_dataset/c12m/'
+       S3_PATH: 's3://cc12m/'
+       SEQ_PER_SAMPLE: 1
+       SAMPLER: NodeDistributed
+       CACHE_MODE: True
+       CIRCULAR_CACHE_MODE: False
+       ZIP_MODE: False
+       CACHE_ORIGIN_IMAGE: False
+       RANDOM_CAPTION: False
+       AS_NUMPY_AS_POSSIBLE: False
+       SAMPLING_WEIGHT: 1.0
+       TRANSFORM: 'clip_transforms'
+     MODEL:
+       MAX_SEQ_LEN: 50
+       TEMP_NAME: logit_scale_caption
+     LOSSES:
+       NAMES: ['CrossEntropy', 'Accuracy']
+       LOSS_WEIGHT: 1.0
+       REDUCTION: 'mean'
+     INFERENCE:
+       VOCAB: 'CLIP'
+       GENERATION_MODE: False
+
+   -
+     NAME: cc3m_caption
+     DATASETS:
+       TRAIN: 'ImageTextPairDataset'
+       TASK_TYPE: 'image_caption'
+       DATASET_NAME: 'CC3M'
+       TARGET_SET: ['Vocab_Word']
+     DATALOADER:
+       TRAIN_BATCH_SIZE: 64
+       TEST_BATCH_SIZE: 32
+       NUM_WORKERS: 2
+       S3_ANNO_FOLDER: 's3://cc3m/'
+       ANNO_FOLDER: 'open_source_dataset/cc3m/'
+       ANNO_FILENAME: 'train_spacy.json'
+       FEATS_FOLDER: 'open_source_dataset/cc3m/'
+       S3_PATH: 's3://cc3m/'
+       SEQ_PER_SAMPLE: 1
+       SAMPLER: NodeDistributed
+       CACHE_MODE: True
+       CIRCULAR_CACHE_MODE: False
+       ZIP_MODE: False
+       CACHE_ORIGIN_IMAGE: False
+       RANDOM_CAPTION: False
+       AS_NUMPY_AS_POSSIBLE: False
+       SAMPLING_WEIGHT: 1.0
+       TRANSFORM: 'clip_transforms'
+     MODEL:
+       MAX_SEQ_LEN: 50
+       TEMP_NAME: logit_scale_caption
+     LOSSES:
+       NAMES: ['CrossEntropy', 'Accuracy']
+       LOSS_WEIGHT: 1.0
+       REDUCTION: 'mean'
+     INFERENCE:
+       VOCAB: 'CLIP'
+       GENERATION_MODE: False
+
+   -
+     NAME: sbu_caption
+     DATASETS:
+       TRAIN: 'ImageTextPairDataset'
+       TASK_TYPE: 'image_caption'
+       DATASET_NAME: 'SBU'
+       TARGET_SET: ['Vocab_Word']
+     DATALOADER:
+       TRAIN_BATCH_SIZE: 64
+       TEST_BATCH_SIZE: 32
+       NUM_WORKERS: 1
+       S3_ANNO_FOLDER: 's3://SBU/annotations'
+       ANNO_FOLDER: 'open_source_dataset/sbucaption/annotations'
+       ANNO_FILENAME: 'subcaption.json'
+       FEATS_FOLDER: 'open_source_dataset/sbucaption/'
+       S3_PATH: 's3://SBU/images'
+       SEQ_PER_SAMPLE: 1
+       SAMPLER: NodeDistributed
+       CACHE_MODE: True
+       CIRCULAR_CACHE_MODE: False
+       ZIP_MODE: False
+       CACHE_ORIGIN_IMAGE: False
+       RANDOM_CAPTION: False
+       AS_NUMPY_AS_POSSIBLE: False
+       SAMPLING_WEIGHT: 1.0
+       TRANSFORM: 'clip_transforms'
+     MODEL:
+       MAX_SEQ_LEN: 50
+       TEMP_NAME: logit_scale_caption
+     LOSSES:
+       NAMES: ['CrossEntropy', 'Accuracy']
+       LOSS_WEIGHT: 1.0
+       REDUCTION: 'mean'
+     INFERENCE:
+       VOCAB: 'CLIP'
+       GENERATION_MODE: False
+
+   -
+     NAME: vg_caption
+     DATASETS:
+       TRAIN: 'ImageTextPairDataset'
+       TASK_TYPE: 'image_caption'
+       DATASET_NAME: 'VG'
+       TARGET_SET: ['Vocab_Word']
+     DATALOADER:
+       TRAIN_BATCH_SIZE: 64
+       TEST_BATCH_SIZE: 32
+       NUM_WORKERS: 2
+       FEATS_FOLDER: 'open_source_dataset/visual_genome/images'
+       ANNO_FOLDER: 'open_source_dataset/visual_genome/annotations'
+       S3_PATH: 's3://visual_genome/images'
+       ANNO_FILENAME: 'vg_captions_128filter.json'
+       SEQ_PER_SAMPLE: 1
+       CACHE_MODE: True
+       CIRCULAR_CACHE_MODE: False
+       ZIP_MODE: False
+       CACHE_ORIGIN_IMAGE: False
+       RANDOM_CAPTION: False
+       AS_NUMPY_AS_POSSIBLE: False
+       SAMPLING_WEIGHT: 1.0
+       TRANSFORM: 'clip_transforms'
+     MODEL:
+       MAX_SEQ_LEN: 30
+       TEMP_NAME: logit_scale_caption
+     LOSSES:
+       NAMES: ['CrossEntropy', 'Accuracy']
+       LOSS_WEIGHT: 1.0
+       REDUCTION: 'mean'
+     INFERENCE:
+       VOCAB: 'CLIP'
+       GENERATION_MODE: True
+
+   -
+     NAME: mscoco_retrieve
+     DATASETS:
+       TRAIN: 'ImageTextPairDataset'
+       # TEST: 'ImageTextPairDataset'
+       TASK_TYPE: 'image_retrieval'
+       DATASET_NAME: 'MSCOCO'
+     DATALOADER:
+       TRAIN_BATCH_SIZE: 100
+       TEST_BATCH_SIZE: 32
+       NUM_WORKERS: 1
+       FEATS_FOLDER: 'open_source_dataset/mscoco_dataset/coco_origin'
+       ANNO_FOLDER: 'open_source_dataset/mscoco_dataset/new_annotations'
+       S3_PATH: 's3://coco/'
+       SEQ_PER_SAMPLE: 1
+       CACHE_MODE: True
+       CIRCULAR_CACHE_MODE: False
+       ZIP_MODE: False
+       CACHE_ORIGIN_IMAGE: False
+       RANDOM_CAPTION: False
+       AS_NUMPY_AS_POSSIBLE: False
+       SAMPLING_WEIGHT: 1.0
+       TRANSFORM: 'clip_transforms'
+     MODEL:
+       MAX_SEQ_LEN: 50
+       TEMP_NAME: logit_scale_retrieve
+     LOSSES:
+       NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
+       LABELSMOOTHING: 0.1
+       LOSS_WEIGHT: 1.0
+       REDUCTION: 'mean'
+     INFERENCE:
+       VOCAB: 'CLIP'
+       ID_KEY: 'image_id'
+       VALUE: 'caption'
+       NAME: 'RetrievalEvaler'
+       VAL_ANNFILE: 'open_source_dataset/flickr30k/all_data_final_val_set0_2014.jsonline'
+       TEST_ANNFILE: 'open_source_dataset/flickr30k/all_data_final_test_set0_2014.jsonline'
+       GENERATION_MODE: False
+
+   -
+     NAME: yfcc_retrieve
+     DATASETS:
+       TRAIN: 'ImageTextPairDataset'
+       TASK_TYPE: 'image_retrieval'
+       DATASET_NAME: 'YFCC'
+     DATALOADER:
+       TRAIN_BATCH_SIZE: 64
+       TEST_BATCH_SIZE: 32
+       NUM_WORKERS: 2
+       S3_ANNO_FOLDER: 'cluster2:s3://yfcc'
+       ANNO_FOLDER: 'open_source_dataset/yfcc'
+       ANNO_FILENAME: 'yfcc100m_subset_available_untokenized.json'
+       FEATS_FOLDER: 'open_source_dataset/yfcc/'
+       S3_PATH: 'cluster2:s3://yfcc/'
+       SAMPLER: NodeDistributed
+       CACHE_MODE: True
+       CIRCULAR_CACHE_MODE: False
+       ZIP_MODE: False
+       CACHE_ORIGIN_IMAGE: False
+       RANDOM_CAPTION: True
+       AS_NUMPY_AS_POSSIBLE: False
+       SAMPLING_WEIGHT: 1.0
+       TRANSFORM: 'clip_transforms'
+     MODEL:
+       MAX_SEQ_LEN: 50
+       TEMP_NAME: logit_scale_retrieve
+     LOSSES:
+       NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
+       LABELSMOOTHING: 0.1
+       LOSS_WEIGHT: 0.5
+       REDUCTION: 'mean'
+     INFERENCE:
+       VOCAB: 'CLIP'
+       GENERATION_MODE: False
+
+   -
+     NAME: cc12m_retrieve
+     DATASETS:
+       TRAIN: 'ImageTextPairDataset'
+       TASK_TYPE: 'image_retrieval'
+       DATASET_NAME: 'CC12M'
+     DATALOADER:
+       TRAIN_BATCH_SIZE: 64
+       TEST_BATCH_SIZE: 32
+       NUM_WORKERS: 2
+       S3_ANNO_FOLDER: 's3://cc12m/'
+       ANNO_FOLDER: 'open_source_dataset/c12m/'
+       ANNO_FILENAME: 'train_available.json'
+       FEATS_FOLDER: 'open_source_dataset/c12m/'
+       S3_PATH: 's3://cc12m/'
+       SAMPLER: NodeDistributed
+       CACHE_MODE: True
+       CIRCULAR_CACHE_MODE: False
+       ZIP_MODE: False
+       CACHE_ORIGIN_IMAGE: False
+       RANDOM_CAPTION: False
+       AS_NUMPY_AS_POSSIBLE: False
+       SAMPLING_WEIGHT: 1.0
+       TRANSFORM: 'clip_transforms'
+     MODEL:
+       MAX_SEQ_LEN: 50
+       TEMP_NAME: logit_scale_retrieve
+     LOSSES:
+       NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
+       LABELSMOOTHING: 0.1
+       LOSS_WEIGHT: 0.5
+       REDUCTION: 'mean'
+     INFERENCE:
+       VOCAB: 'CLIP'
+       GENERATION_MODE: False
+
+   -
+     NAME: cc3m_retrieve
+     DATASETS:
+       TRAIN: 'ImageTextPairDataset'
+       TASK_TYPE: 'image_retrieval'
+       DATASET_NAME: 'CC3M'
+     DATALOADER:
+       TRAIN_BATCH_SIZE: 64
+       TEST_BATCH_SIZE: 32
+       NUM_WORKERS: 2
+       S3_ANNO_FOLDER: 's3://cc3m/'
+       ANNO_FOLDER: 'open_source_dataset/cc3m/'
+       ANNO_FILENAME: 'train_spacy.json'
+       FEATS_FOLDER: 'open_source_dataset/cc3m/'
+       S3_PATH: 's3://cc3m/'
+       SAMPLER: NodeDistributed
+       CACHE_MODE: True
+       CIRCULAR_CACHE_MODE: False
+       ZIP_MODE: False
+       CACHE_ORIGIN_IMAGE: False
+       RANDOM_CAPTION: False
+       AS_NUMPY_AS_POSSIBLE: False
+       SAMPLING_WEIGHT: 1.0
+       TRANSFORM: 'clip_transforms'
+     MODEL:
+       MAX_SEQ_LEN: 50
+       TEMP_NAME: logit_scale_retrieve
+     LOSSES:
+       NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
+       LABELSMOOTHING: 0.1
+       LOSS_WEIGHT: 0.5
+       REDUCTION: 'mean'
+     INFERENCE:
+       VOCAB: 'CLIP'
+       GENERATION_MODE: False
+
+   -
+     NAME: vg_retrieve
+     DATASETS:
+       TRAIN: 'ImageTextPairDataset'
+       TASK_TYPE: 'image_retrieval'
+       DATASET_NAME: 'VG'
+     DATALOADER:
+       TRAIN_BATCH_SIZE: 64
+       TEST_BATCH_SIZE: 32
+       NUM_WORKERS: 2
+       FEATS_FOLDER: 'open_source_dataset/visual_genome/images'
+       ANNO_FOLDER: 'open_source_dataset/visual_genome/annotations'
+       S3_PATH: 's3://visual_genome/images'
+       ANNO_FILENAME: 'vg_captions_128filter.json'
+       SEQ_PER_SAMPLE: 1
+       CACHE_MODE: True
+       CIRCULAR_CACHE_MODE: False
+       ZIP_MODE: False
+       CACHE_ORIGIN_IMAGE: False
+       RANDOM_CAPTION: False
+       AS_NUMPY_AS_POSSIBLE: False
+       SAMPLING_WEIGHT: 1.0
+       TRANSFORM: 'clip_transforms'
+     MODEL:
+       MAX_SEQ_LEN: 30
+       TEMP_NAME: logit_scale_retrieve
+     LOSSES:
+       NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
+       LABELSMOOTHING: 0.1
+       LOSS_WEIGHT: 0.5
+       REDUCTION: 'mean'
+     INFERENCE:
+       VOCAB: 'CLIP'
+       GENERATION_MODE: False
+
+   -
+     NAME: sbu_retrieve
+     DATASETS:
+       TRAIN: 'ImageTextPairDataset'
+       TASK_TYPE: 'image_retrieval'
+       DATASET_NAME: 'SBU'
+     DATALOADER:
+       TRAIN_BATCH_SIZE: 64
+       TEST_BATCH_SIZE: 32
+       NUM_WORKERS: 1
+       S3_ANNO_FOLDER: 's3://SBU/annotations'
+       ANNO_FOLDER: 'open_source_dataset/sbucaption/annotations'
+       ANNO_FILENAME: 'subcaption.json'
+       FEATS_FOLDER: 'open_source_dataset/sbucaption/'
+       S3_PATH: 's3://SBU/images'
+       SAMPLER: NodeDistributed
+       CACHE_MODE: True
+       CIRCULAR_CACHE_MODE: False
+       ZIP_MODE: False
+       CACHE_ORIGIN_IMAGE: False
+       RANDOM_CAPTION: False
+       AS_NUMPY_AS_POSSIBLE: False
+       SAMPLING_WEIGHT: 1.0
+       TRANSFORM: 'clip_transforms'
+     MODEL:
+       MAX_SEQ_LEN: 50
+       TEMP_NAME: logit_scale_retrieve
+     LOSSES:
+       NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
+       LABELSMOOTHING: 0.1
+       LOSS_WEIGHT: 0.5
+       REDUCTION: 'mean'
+     INFERENCE:
+       VOCAB: 'CLIP'
+       GENERATION_MODE: False
+
+   -
+     NAME: flickr30k_retrieve
+     DATASETS:
+       TRAIN: 'ImageTextPairDataset'
+       TASK_TYPE: 'image_retrieval'
+       TEST: 'ImageTextPairDataset'
+       DATASET_NAME: 'FLICKR'
+     DATALOADER:
+       TRAIN_BATCH_SIZE: 128
+       TEST_BATCH_SIZE: 128
+       NUM_WORKERS: 2
+       FEATS_FOLDER: 'open_source_dataset/flickr30k_images/flickr30k_images/flickr30k_images'
+       ANNO_FOLDER: 'open_source_dataset/flickr30k'
+       S3_PATH: "s3://open_dataset/flickr30k/flickr30k_images"
+       SEQ_PER_SAMPLE: 1
+       CACHE_MODE: True
+       CIRCULAR_CACHE_MODE: False
+       ZIP_MODE: False
+       CACHE_ORIGIN_IMAGE: False
+       RANDOM_CAPTION: False
+       AS_NUMPY_AS_POSSIBLE: False
+       SAMPLING_WEIGHT: 1.0
+       TRANSFORM: 'clip_transforms'
+     MODEL:
+       MAX_SEQ_LEN: 77
+       TEMP_NAME: logit_scale_retrieve
+     LOSSES:
+       NAMES: ['LabelSmoothingCrossEntropy']
+       LABELSMOOTHING: 0.1
+       LOSS_WEIGHT: 1.0
+       REDUCTION: 'mean'
+     INFERENCE:
+       VOCAB: 'CLIP'
+       ID_KEY: 'image_id'
+       VALUE: 'caption'
+       NAME: 'RetrievalEvaler'
+       VAL_ANNFILE: 'open_source_dataset/flickr30k/all_data_final_val_set0_2014.jsonline'
+       TEST_ANNFILE: 'open_source_dataset/flickr30k/all_data_final_test_set0_2014.jsonline'
+       GENERATION_MODE: False
+
+   -
+     NAME: flickr30k_caption
+     DATASETS:
+       TRAIN: 'ImageTextPairDataset'
+       TASK_TYPE: 'image_caption'
+       TEST: 'ImageTextPairDataset'
+       DATASET_NAME: 'FLICKR'
+       TARGET_SET: ['Vocab_Word']
+     DATALOADER:
+       TRAIN_BATCH_SIZE: 32
+       TEST_BATCH_SIZE: 8
+       NUM_WORKERS: 4
+       FEATS_FOLDER: 'open_source_dataset/flickr30k_images/flickr30k_images/flickr30k_images'
+       ANNO_FOLDER: 'open_source_dataset/flickr30k'
+       S3_PATH: "s3://open_dataset/flickr30k/flickr30k_images"
+       SEQ_PER_SAMPLE: 1
+       CACHE_MODE: True
+       CIRCULAR_CACHE_MODE: False
+       ZIP_MODE: False
+       CACHE_ORIGIN_IMAGE: False
+       RANDOM_CAPTION: False
+       AS_NUMPY_AS_POSSIBLE: False
+       SAMPLING_WEIGHT: 1.0
+       TRANSFORM: 'clip_transforms'
+       TASK_TYPE: caption
+       # DATA_PERCENTAGE: 0.01
+     MODEL:
+       MAX_SEQ_LEN: 21
+       TEMP_NAME: logit_scale_caption
+     LOSSES:
+       NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
+       LABELSMOOTHING: 0.1
+       LOSS_WEIGHT: 1.0
+       REDUCTION: 'mean'
+     DECODE_STRATEGY:
+       NAME: 'CaptionBeamSearcherV3'
+       BEAM_SIZE: 2
+     INFERENCE:
+       NAME: 'COCOEvaler'
+       VOCAB: 'CLIP'
+       ID_KEY: 'image_id'
+       VALUE: 'caption'
+       VAL_ANNFILE: 'open_source_dataset/flickr30k/captions_val.json'
+       TEST_ANNFILE: 'open_source_dataset/flickr30k/captions_test.json'
+       GENERATION_MODE: True
+
+ ENGINE:
+   NAME: 'UnifiedTrainer'
+
+ MODEL:
+   META_ARCHITECTURE: 'MultiTaskTransformerEncoder'
+   ENCODER: 'UnifiedBertEncoder'
+
+   IN_TUNING: True # use IN1k instead of 22k
+   SHARE_LAYERNORM: True
+   BERT:
+     NORMALIZE_DECISION: "BERTPre"
+     DROP_PATH_PROB: 0.1
+     NUM_HIDDEN_LAYERS: 1
+     DROP_PATH_PROB_FIXED: True
+
+   UNIFY_QKV: True
+
+   MODEL_EMA: False
+   MODEL_EMA_DECAY: 0.9999
+
+   MAEParamsInit: True
+   POSEMBEDFIX: True
+
+   IMG_INPUT_SIZE: 224
+   PATCH_SIZE: 16
+
+   LAYER_SCALE: True
+   LAYER_SCALE_INIT: 1e-3
+
+ DATALOADER:
+   USE_WEIGHTED_SAMPLER: True
+   UNIFIED_DATASET: True
+   NUM_WORKERS: 16
+
+   PADDING_TO_MAX: False # True for debugging or token moe with distributed moe
+
+ ####################################### Optimizer #######################################
+ SOLVER:
+   NAME: 'Adam'
+   TORCH_OPTIMIZER: True
+   PARAMS_SEPERATE: True
+   # PARAMS_GROUP: True
+   # EPOCH: 1
+   MAX_ITER: 150000
+   CHECKPOINT_PERIOD: 5000
+   EVAL_PERIOD: 500000
+   BASE_LR: 0.001
+   BIAS_LR_FACTOR: 1.0
+   WEIGHT_DECAY: 0.05
+   WEIGHT_DECAY_NORM: 0.0
+   WEIGHT_DECAY_BIAS: 0.0
+   WEIGHT_DECAY_EMBEDDING: 0.0
+   MOMENTUM: 0.9
+   DAMPENING: 0.0
+   NESTEROV: 0.0
+   BETAS: [0.9, 0.95]
+   EPS: 1e-6
+   GRAD_CLIP: 0.1
+   GRAD_CLIP_TYPE: 'norm'
+   ACCUM_ITER: 0
+   AMP_FP16: True
+   APEX_FP16: False # dangerous
+
+   WRITE_PERIOD: 50
+   MIN_LOSS_SCLE: 2048.0
+   # BF16: False # True
+   # ZEROSTAGE: 2
+
+   LOSS_SCALE_WINDOW: 200
+
+ ####################################### lr scheduler #######################################
+ LR_SCHEDULER:
+   NAME: 'WarmupCosine'
+   WARMUP: 5000
+   MIN_LR: 0.000001
+
+ ####################################### evaluation #######################################
+ INFERENCE:
+
+   VOCAB: 'CLIP'
+   ITER_BASED: True
+
+ find_unused_parameters: true
+
+ # ENCODERS:
+ #   -
+ #     NAME: VisualEncoder
+ #     TYPE: VisualEncoder
+ #     DROP_PATH_PROB: 0.0
+ #     HIDDEN_SIZE: 192
+ #     HIDDEN_DROPOUT_PROB: 0.
+ #     HIDDEN_ACT: "gelu"
+ #     NUM_ATTENTION_HEADS: 3
+ #     INTERMEDIATE_SIZE: 768
+ #     INTERMEDIATE_DROP: 0.
+ #     FFN_DROPOUT_PROB: 0.
+ #     ATTENTION_PROBS_DROPOUT_PROB: 0.
+ #     NUM_HIDDEN_LAYERS: 6
+ #     NUM_GENERATION_LAYERS: 0
+ #     DROP_PATH_PROB_FIXED: True
+ #
+ #   -
+ #     NAME: TextEncoder
+ #     TYPE: TextEncoder
+ #     DROP_PATH_PROB: 0.0
+ #     HIDDEN_SIZE: 192
+ #     HIDDEN_DROPOUT_PROB: 0.
+ #     HIDDEN_ACT: "gelu"
+ #     NUM_ATTENTION_HEADS: 3
+ #     INTERMEDIATE_SIZE: 768
+ #     INTERMEDIATE_DROP: 0.
+ #     FFN_DROPOUT_PROB: 0.
+ #     ATTENTION_PROBS_DROPOUT_PROB: 0.
+ #     NUM_HIDDEN_LAYERS: 6
+ #     NUM_GENERATION_LAYERS: 0
+ #     DROP_PATH_PROB_FIXED: True
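For readers unfamiliar with this config schema: the SOLVER and LR_SCHEDULER sections above roughly correspond to a standard PyTorch optimizer setup. The sketch below is an illustration of that correspondence only; the repo's actual optimizer builder (including the per-group weight-decay rules such as WEIGHT_DECAY_NORM and WEIGHT_DECAY_BIAS) lives in its training engine, and the names here are ours.

```python
import torch

model = torch.nn.Linear(192, 192)  # stand-in for the real model
# Illustrative mapping of the SOLVER keys onto torch.optim.Adam
# (NAME: 'Adam', BASE_LR, BETAS, EPS, WEIGHT_DECAY).
optimizer = torch.optim.Adam(model.parameters(),
                             lr=0.001,          # BASE_LR
                             betas=(0.9, 0.95), # BETAS
                             eps=1e-6,          # EPS
                             weight_decay=0.05) # WEIGHT_DECAY

loss = model(torch.randn(4, 192)).pow(2).mean()
loss.backward()
# GRAD_CLIP: 0.1 with GRAD_CLIP_TYPE: 'norm'
torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
optimizer.step()
```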
configs/BERT_L12_H192_experiments/4tasks_training_small_datasets.yaml ADDED
@@ -0,0 +1,292 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _BASE_: "base_model_bert_l12_h192.yaml"
2
+
3
+ SHARED_TARGETS:
4
+
5
+ -
6
+ NAME: 'ImageNet1k'
7
+ SHARED_TARGETS_CFG:
8
+ FILE_PATH: 'small_source_dataset/imagenet_class_name_CLIP_with_endoftext.pkl'
9
+ DISTRIBUTED: False
10
+
11
+ -
12
+ NAME: 'Vocab_Word'
13
+ SHARED_TARGETS_CFG:
14
+ FILE_PATH: 'small_source_dataset/vocabulary_CLIP_with_endoftext.pkl'
15
+ DISTRIBUTED: True
16
+
17
+ TASKS:
18
+
19
+   -
+     NAME: imagenet
+     DATASETS:
+       TRAIN: 'ImageNetDataset'
+       VAL: 'ImageNetDataset'
+       TASK_TYPE: 'image_classification'
+       DATASET_NAME: 'ImageNet1k'
+       TARGET_SET: ['ImageNet1k']
+
+     DATALOADER:
+       TRAIN_BATCH_SIZE: 4
+       # TEST_BATCH_SIZE: 2
+       NUM_WORKERS: 4
+       FEATS_FOLDER: 'small_source_dataset/imagenet'
+       ANNO_FOLDER: 'small_source_dataset/imagenet/meta'
+       SAMPLING_WEIGHT: 1.0
+       MIXUP: 0.8
+       CUTMIX: 1.0
+       MIXUP_PROB: 1.0
+       MIXUP_SWITCH_PROB: 0.5
+       MIXUP_MODE: 'batch'
+       MIXUP_LABEL_SMOOTHING: 0.1
+     MODEL:
+       MAX_SEQ_LEN: -1
+       LABELS_NUM: 1000
+       TEMP_NAME: logit_scale_img_cls
+     LOSSES:
+       NAMES: ['SoftTargetCrossEntropy', 'Accuracy']
+       LOSS_WEIGHT: 1.0
+       REDUCTION: 'mean'
+       # LOSS_FP32: True
+     INFERENCE:
+       NAME: 'ImageNetEvaler'
+       ID_KEY: 'image_id'
+       VALUE: 'cls_logits'
+       VAL_ANNFILE: 'small_source_dataset/imagenet/meta/val.txt'
+       TEST_ANNFILE: ''
+       GENERATION_MODE: False
+
+   -
+     NAME: bookswiki_pretrain
+     DATASETS:
+       TRAIN: 'GeneralCorpusDataset'
+       TASK_TYPE: 'text_mlm'
+       DATASET_NAME: 'BooksWiki'
+       TARGET_SET: ['Vocab_Word']
+     DATALOADER:
+       TRAIN_BATCH_SIZE: 128
+       TEST_BATCH_SIZE: 32
+       NUM_WORKERS: 2
+       ANNO_FOLDER: 'small_source_dataset/bert_pretrain_data/bookswiki'
+       SEQ_PER_SAMPLE: 1
+       SAMPLER: NodeDistributed
+       CACHE_MODE: True
+       SEQ_PER_SAMPLE: 128
+       MIN_SEQ_PER_SAMPLE: 128
+       APPEND_EOS: True
+       ONE_STREAM: False
+       SAMPLING_WEIGHT: 1.0
+       RANDOM_MASK: True
+     MODEL:
+       MAX_SEQ_LEN: 128
+       TEMP_NAME: logit_scale_text_mlm
+     LOSSES:
+       NAMES: ['CrossEntropy', 'Accuracy']
+       LOSS_WEIGHT: 0.33333
+       REDUCTION: 'mean'
+     INFERENCE:
+       VOCAB: 'CLIP'
+       GENERATION_MODE: False
+
+   -
+     NAME: mscoco_caption
+     DATASETS:
+       TRAIN: 'ImageTextPairDataset'
+       # VAL: 'ImageTextPairDataset'
+       TEST: 'ImageTextPairDataset'
+       TASK_TYPE: 'image_caption'
+       DATASET_NAME: 'MSCOCO'
+       TARGET_SET: ['Vocab_Word']
+     DATALOADER:
+       TRAIN_BATCH_SIZE: 64
+       TEST_BATCH_SIZE: 32
+       NUM_WORKERS: 4
+       FEATS_FOLDER: 'small_source_dataset/mscoco_caption/coco_origin'
+       ANNO_FOLDER: 'small_source_dataset/mscoco_caption/annotations'
+       SEQ_PER_SAMPLE: 1
+       SAMPLING_WEIGHT: 1.0
+       TRANSFORM: 'clip_transforms'
+       RANDOM_MASK: True
+     MODEL:
+       MAX_SEQ_LEN: 50
+       EVAL_MAX_SEQ_LEN: 21
+       TEMP_NAME: logit_scale_caption
+     LOSSES:
+       NAMES: ['CrossEntropy', 'Accuracy']
+       LOSS_WEIGHT: 0.33333
+       REDUCTION: 'mean'
+     DECODE_STRATEGY:
+       NAME: 'CaptionBeamSearcherV3'
+       BEAM_SIZE: 2
+       # LEN_PENALTY: 1.0
+     INFERENCE:
+       NAME: 'COCOEvaler'
+       VOCAB: 'CLIP'
+       ID_KEY: 'image_id'
+       VALUE: 'caption'
+       VAL_ANNFILE: 'small_source_dataset/mscoco_caption/annotations/captions_val5k.json'
+       TEST_ANNFILE: 'small_source_dataset/mscoco_caption/annotations/captions_test5k.json'
+       GENERATION_MODE: True
+
+   -
+     NAME: mscoco_retrieve
+     DATASETS:
+       TRAIN: 'ImageTextPairDataset'
+       TEST: 'ImageTextPairDataset'
+       TASK_TYPE: 'image_retrieval'
+       DATASET_NAME: 'MSCOCO'
+     DATALOADER:
+       TRAIN_BATCH_SIZE: 100
+       TEST_BATCH_SIZE: 32
+       NUM_WORKERS: 1
+       FEATS_FOLDER: 'small_source_dataset/mscoco_caption/coco_origin'
+       ANNO_FOLDER: 'small_source_dataset/mscoco_caption/annotations'
+       SEQ_PER_SAMPLE: 1
+       SAMPLING_WEIGHT: 1.0
+       TRANSFORM: 'clip_transforms'
+     MODEL:
+       MAX_SEQ_LEN: 50
+       TEMP_NAME: logit_scale_retrieve
+     LOSSES:
+       NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
+       LABELSMOOTHING: 0.1
+       LOSS_WEIGHT: 1.0
+       REDUCTION: 'mean'
+     INFERENCE:
+       VOCAB: 'CLIP'
+       ID_KEY: 'image_id'
+       VALUE: 'caption'
+       NAME: 'RetrievalEvaler'
+       GENERATION_MODE: False
+
+ ENGINE:
+   NAME: 'UnifiedTrainer'
+
+ MODEL:
+   META_ARCHITECTURE: 'MultiTaskTransformerEncoder'
+   ENCODER: 'UnifiedBertEncoder'
+
+   IN_TUNING: True # use IN1k instead of 22k
+   SHARE_LAYERNORM: True
+   BERT:
+     NORMALIZE_DECISION: "BERTPre"
+     DROP_PATH_PROB: 0.0
+     NUM_HIDDEN_LAYERS: 1
+     DROP_PATH_PROB_FIXED: True
+
+   UNIFY_QKV: True
+
+   MODEL_EMA: False
+   MODEL_EMA_DECAY: 0.9999
+
+   MAEParamsInit: True
+   POSEMBEDFIX: True
+
+   IMG_INPUT_SIZE: 224
+   PATCH_SIZE: 16
+
+   LAYER_SCALE: True
+   LAYER_SCALE_INIT: 1e-3
+
+ DATALOADER:
+   USE_WEIGHTED_SAMPLER: True
+   UNIFIED_DATASET: True
+   NUM_WORKERS: 16
+
+   PADDING_TO_MAX: False # True for debugging or token moe with distributed moe
+
+ ####################################### Optimizer #######################################
+ SOLVER:
+   NAME: 'Adam'
+   TORCH_OPTIMIZER: True
+   PARAMS_SEPERATE: True
+   # PARAMS_GROUP: True
+   # EPOCH: 1
+   MAX_ITER: 150000
+   CHECKPOINT_PERIOD: 5000
+   EVAL_PERIOD: 500000
+   BASE_LR: 0.001
+   BIAS_LR_FACTOR: 1.0
+   WEIGHT_DECAY: 0.05
+   WEIGHT_DECAY_NORM: 0.0
+   WEIGHT_DECAY_BIAS: 0.0
+   WEIGHT_DECAY_EMBEDDING: 0.0
+   MOMENTUM: 0.9
+   DAMPENING: 0.0
+   NESTEROV: 0.0
+   BETAS: [0.9, 0.95]
+   EPS: 1e-6
+   GRAD_CLIP: 0.1
+   GRAD_CLIP_TYPE: 'norm'
+   ACCUM_ITER: 0
+   AMP_FP16: True
+   APEX_FP16: False # dangerous
+
+   WRITE_PERIOD: 50
+   MIN_LOSS_SCLE: 2048.0
+   # BF16: False # True
+   # ZEROSTAGE: 2
+
+   LOSS_SCALE_WINDOW: 200
+
+ ####################################### lr scheduler #######################################
+ LR_SCHEDULER:
+   NAME: 'WarmupCosine'
+   WARMUP: 5000
+   MIN_LR: 0.000001
+
+ ####################################### evaluation #######################################
+ INFERENCE:
+   VOCAB: 'CLIP'
+   ITER_BASED: True
+
+ find_unused_parameters: true
+
+ # ENCODERS:
+ #   -
+ #     NAME: VisualEncoder
+ #     TYPE: VisualEncoder
+ #     DROP_PATH_PROB: 0.0
+ #     HIDDEN_SIZE: 192
+ #     HIDDEN_DROPOUT_PROB: 0.
+ #     HIDDEN_ACT: "gelu"
+ #     NUM_ATTENTION_HEADS: 3
+ #     INTERMEDIATE_SIZE: 768
+ #     INTERMEDIATE_DROP: 0.
+ #     FFN_DROPOUT_PROB: 0.
+ #     ATTENTION_PROBS_DROPOUT_PROB: 0.
+ #     NUM_HIDDEN_LAYERS: 6
+ #     NUM_GENERATION_LAYERS: 0
+ #     DROP_PATH_PROB_FIXED: True
+
+ #   -
+ #     NAME: TextEncoder
+ #     TYPE: TextEncoder
+ #     DROP_PATH_PROB: 0.0
+ #     HIDDEN_SIZE: 192
+ #     HIDDEN_DROPOUT_PROB: 0.
+ #     HIDDEN_ACT: "gelu"
+ #     NUM_ATTENTION_HEADS: 3
+ #     INTERMEDIATE_SIZE: 768
+ #     INTERMEDIATE_DROP: 0.
+ #     FFN_DROPOUT_PROB: 0.
+ #     ATTENTION_PROBS_DROPOUT_PROB: 0.
+ #     NUM_HIDDEN_LAYERS: 6
+ #     NUM_GENERATION_LAYERS: 0
+ #     DROP_PATH_PROB_FIXED: True
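Each task above carries a SAMPLING_WEIGHT that, together with USE_WEIGHTED_SAMPLER, determines how often the unified trainer draws its next batch from that task. A minimal sketch of turning such weights into sampling probabilities (the helper below is illustrative, not this repo's actual sampler):

    import random

    def pick_task(weights):
        # weights: mapping of task name -> SAMPLING_WEIGHT from the config
        total = sum(weights.values())
        probs = [w / total for w in weights.values()]
        return random.choices(list(weights), probs)[0]

    # All four tasks in this file use 1.0, so each is drawn with probability 0.25.
    pick_task({'imagenet': 1.0, 'bookswiki_pretrain': 1.0,
               'mscoco_caption': 1.0, 'mscoco_retrieve': 1.0})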
configs/BERT_L12_H192_experiments/7tasks_berttiny_training.yaml ADDED
@@ -0,0 +1,416 @@
+ _BASE_: "base_model_bert_l12_h192.yaml"
+
+ SHARED_TARGETS:
+
+   -
+     NAME: 'ImageNet1k'
+     SHARED_TARGETS_CFG:
+       FILE_PATH: 'open_source_dataset/imagenet_class_name_CLIP_with_endoftext.pkl'
+       DISTRIBUTED: False
+
+   -
+     NAME: 'Vocab_Word'
+     SHARED_TARGETS_CFG:
+       FILE_PATH: 'open_source_dataset/vocabulary_CLIP_with_endoftext.pkl'
+       DISTRIBUTED: True
+
+ TASKS:
+
+   -
+     NAME: imagenet
+     DATASETS:
+       TRAIN: 'ImageNetDataset'
+       VAL: 'ImageNetDataset'
+       TASK_TYPE: 'image_classification'
+       DATASET_NAME: 'ImageNet1k'
+       TARGET_SET: ['ImageNet1k']
+
+     DATALOADER:
+       TRAIN_BATCH_SIZE: 720
+       TEST_BATCH_SIZE: 256
+       NUM_WORKERS: 4
+       FEATS_FOLDER: 'open_source_dataset/imagenet'
+       S3_PATH: 'cluster2:s3://imagenet'
+       ANNO_FOLDER: 'open_source_dataset/imagenet/meta'
+       SAMPLING_WEIGHT: 2.5
+       CLASS_NAME_FILE: 'open_source_dataset/imagenet_class_name.pkl'
+       MIXUP: 0.8
+       CUTMIX: 1.0
+       MIXUP_PROB: 1.0
+       MIXUP_SWITCH_PROB: 0.5
+       MIXUP_MODE: 'batch'
+       MIXUP_LABEL_SMOOTHING: 0.1
+     MODEL:
+       MAX_SEQ_LEN: -1
+       LABELS_NUM: 1000
+       TEMP_NAME: logit_scale_img_cls
+     LOSSES:
+       NAMES: ['SoftTargetCrossEntropy', 'Accuracy']
+       LOSS_WEIGHT: 1.0
+       REDUCTION: 'mean'
+       # LOSS_FP32: True
+     INFERENCE:
+       NAME: 'ImageNetEvaler'
+       ID_KEY: 'image_id'
+       VALUE: 'cls_logits'
+       VAL_ANNFILE: 'open_source_dataset/imagenet/meta/val.txt'
+       TEST_ANNFILE: ''
+       GENERATION_MODE: False
+
+   -
+     NAME: bookswiki_pretrain
+     DATASETS:
+       TRAIN: 'GeneralCorpusDataset'
+       TASK_TYPE: 'text_mlm'
+       DATASET_NAME: 'BooksWiki'
+       TARGET_SET: ['Vocab_Word']
+       VERSION: 'v2'
+     DATALOADER:
+       TRAIN_BATCH_SIZE: 512
+       TEST_BATCH_SIZE: 32
+       NUM_WORKERS: 2
+       ANNO_FOLDER: 'open_source_dataset/text_corpus' # 'open_source_dataset/bert_pretrain_data/bookswiki'
+       # ANNO_FOLDER: 'open_source_dataset/bert_pretrain_data/bookswiki'
+       SEQ_PER_SAMPLE: 1
+       SAMPLER: NodeDistributed
+       CACHE_MODE: True
+       SEQ_PER_SAMPLE: 128
+       MIN_SEQ_PER_SAMPLE: 128
+       APPEND_EOS: True
+       ONE_STREAM: False
+       SAMPLING_WEIGHT: 3.5
+       RANDOM_MASK: True
+     MODEL:
+       MAX_SEQ_LEN: 128
+       TEMP_NAME: logit_scale_text_mlm
+     LOSSES:
+       NAMES: ['CrossEntropy', 'Accuracy']
+       LOSS_WEIGHT: 0.33333
+       REDUCTION: 'mean'
+     INFERENCE:
+       VOCAB: 'CLIP'
+       GENERATION_MODE: False
+
+   ########## Image Captioning ###########
+
+   -
+     NAME: cc12m_caption
+     DATASETS:
+       TRAIN: 'ImageTextPairDataset'
+       TASK_TYPE: 'image_caption'
+       DATASET_NAME: 'CC12M'
+       TARGET_SET: ['Vocab_Word']
+     DATALOADER:
+       TRAIN_BATCH_SIZE: 300
+       TEST_BATCH_SIZE: 32
+       NUM_WORKERS: 2
+       ANNO_FOLDER: 'open_source_dataset/c12m/'
+       ANNO_FILENAME: 'train_available.json'
+       FEATS_FOLDER: 'open_source_dataset/c12m/'
+       S3_PATH: 's3://cc12m/'
+       SEQ_PER_SAMPLE: 1
+       SAMPLER: NodeDistributed
+       CACHE_MODE: True
+       CIRCULAR_CACHE_MODE: False
+       ZIP_MODE: False
+       CACHE_ORIGIN_IMAGE: False
+       RANDOM_CAPTION: False
+       AS_NUMPY_AS_POSSIBLE: False
+       SAMPLING_WEIGHT: 1.6889
+       TRANSFORM: 'clip_transforms'
+     MODEL:
+       MAX_SEQ_LEN: 50
+       TEMP_NAME: logit_scale_caption
+     LOSSES:
+       NAMES: ['CrossEntropy', 'Accuracy']
+       LOSS_WEIGHT: 0.33333
+       REDUCTION: 'mean'
+     INFERENCE:
+       VOCAB: 'CLIP'
+       GENERATION_MODE: False
+
+   -
+     NAME: cc3m_caption
+     DATASETS:
+       TRAIN: 'ImageTextPairDataset'
+       TASK_TYPE: 'image_caption'
+       DATASET_NAME: 'CC3M'
+       TARGET_SET: ['Vocab_Word']
+     DATALOADER:
+       TRAIN_BATCH_SIZE: 300
+       TEST_BATCH_SIZE: 32
+       NUM_WORKERS: 2
+       ANNO_FOLDER: 's3://cc3m/'
+       ANNO_FILENAME: 'train_spacy.json'
+       FEATS_FOLDER: 'open_source_dataset/cc3m/'
+       S3_PATH: 's3://cc3m/'
+       SEQ_PER_SAMPLE: 1
+       SAMPLER: NodeDistributed
+       CACHE_MODE: True
+       CIRCULAR_CACHE_MODE: False
+       ZIP_MODE: False
+       CACHE_ORIGIN_IMAGE: False
+       RANDOM_CAPTION: False
+       AS_NUMPY_AS_POSSIBLE: False
+       SAMPLING_WEIGHT: 0.8780
+       TRANSFORM: 'clip_transforms'
+     MODEL:
+       MAX_SEQ_LEN: 50
+       TEMP_NAME: logit_scale_caption
+     LOSSES:
+       NAMES: ['CrossEntropy', 'Accuracy']
+       LOSS_WEIGHT: 0.33333
+       REDUCTION: 'mean'
+     INFERENCE:
+       VOCAB: 'CLIP'
+       GENERATION_MODE: False
+
+   -
+     NAME: vg_caption
+     DATASETS:
+       TRAIN: 'ImageTextPairDataset'
+       TASK_TYPE: 'image_caption'
+       DATASET_NAME: 'VG'
+       TARGET_SET: ['Vocab_Word']
+     DATALOADER:
+       TRAIN_BATCH_SIZE: 300
+       TEST_BATCH_SIZE: 32
+       NUM_WORKERS: 2
+       FEATS_FOLDER: 'open_source_dataset/visual_genome/images'
+       ANNO_FOLDER: 'open_source_dataset/visual_genome/annotations'
+       S3_PATH: 's3://visual_genome/images'
+       ANNO_FILENAME: 'vg_captions_128filter.json'
+       SEQ_PER_SAMPLE: 1
+       CACHE_MODE: True
+       CIRCULAR_CACHE_MODE: False
+       ZIP_MODE: False
+       CACHE_ORIGIN_IMAGE: False
+       RANDOM_CAPTION: False
+       AS_NUMPY_AS_POSSIBLE: False
+       SAMPLING_WEIGHT: 0.5895
+       TRANSFORM: 'clip_transforms'
+     MODEL:
+       MAX_SEQ_LEN: 30
+       TEMP_NAME: logit_scale_caption
+     LOSSES:
+       NAMES: ['CrossEntropy', 'Accuracy']
+       LOSS_WEIGHT: 0.33333
+       REDUCTION: 'mean'
+     INFERENCE:
+       VOCAB: 'CLIP'
+       GENERATION_MODE: True
+
+   -
+     NAME: mscoco_caption
+     DATASETS:
+       TRAIN: 'ImageTextPairDataset'
+       # VAL: 'ImageTextPairDataset'
+       TEST: 'ImageTextPairDataset'
+       TASK_TYPE: 'image_caption'
+       DATASET_NAME: 'MSCOCO'
+       TARGET_SET: ['Vocab_Word']
+     DATALOADER:
+       TRAIN_BATCH_SIZE: 300
+       TEST_BATCH_SIZE: 32
+       NUM_WORKERS: 4
+       FEATS_FOLDER: 'open_source_dataset/mscoco_dataset/coco_origin'
+       ANNO_FOLDER: 'open_source_dataset/mscoco_dataset/new_annotations'
+       S3_PATH: 's3://coco/'
+       SEQ_PER_SAMPLE: 1
+       CACHE_MODE: True
+       CIRCULAR_CACHE_MODE: False
+       ZIP_MODE: False
+       CACHE_ORIGIN_IMAGE: False
+       RANDOM_CAPTION: False
+       AS_NUMPY_AS_POSSIBLE: False
+       SAMPLING_WEIGHT: 0.3817
+       TRANSFORM: 'clip_transforms'
+       RANDOM_MASK: True
+     MODEL:
+       MAX_SEQ_LEN: 50
+       EVAL_MAX_SEQ_LEN: 21
+       TEMP_NAME: logit_scale_caption
+     LOSSES:
+       NAMES: ['CrossEntropy', 'Accuracy']
+       LOSS_WEIGHT: 0.33333
+       REDUCTION: 'mean'
+     DECODE_STRATEGY:
+       NAME: 'CaptionBeamSearcherV3'
+       BEAM_SIZE: 2
+       # LEN_PENALTY: 1.0
+     INFERENCE:
+       NAME: 'COCOEvaler'
+       VOCAB: 'CLIP'
+       ID_KEY: 'image_id'
+       VALUE: 'caption'
+       VAL_ANNFILE: 'open_source_dataset/mscoco_dataset/new_annotations/captions_val5k.json'
+       TEST_ANNFILE: 'open_source_dataset/mscoco_dataset/new_annotations/captions_test5k.json'
+       GENERATION_MODE: True
+
+   -
+     NAME: sbu_caption
+     DATASETS:
+       TRAIN: 'ImageTextPairDataset'
+       TASK_TYPE: 'image_caption'
+       DATASET_NAME: 'SBU'
+       TARGET_SET: ['Vocab_Word']
+     DATALOADER:
+       TRAIN_BATCH_SIZE: 300
+       TEST_BATCH_SIZE: 32
+       NUM_WORKERS: 1
+       ANNO_FOLDER: 'open_source_dataset/sbucaption/annotations'
+       ANNO_FILENAME: 'subcaption.json'
+       FEATS_FOLDER: 'open_source_dataset/sbucaption/'
+       S3_PATH: 's3://SBU/images'
+       SEQ_PER_SAMPLE: 1
+       SAMPLER: NodeDistributed
+       CACHE_MODE: True
+       CIRCULAR_CACHE_MODE: False
+       ZIP_MODE: False
+       CACHE_ORIGIN_IMAGE: False
+       RANDOM_CAPTION: False
+       AS_NUMPY_AS_POSSIBLE: False
+       SAMPLING_WEIGHT: 0.4618
+       TRANSFORM: 'clip_transforms'
+     MODEL:
+       MAX_SEQ_LEN: 50
+       TEMP_NAME: logit_scale_caption
+     LOSSES:
+       NAMES: ['CrossEntropy', 'Accuracy']
+       LOSS_WEIGHT: 0.33333
+       REDUCTION: 'mean'
+     INFERENCE:
+       VOCAB: 'CLIP'
+       GENERATION_MODE: False
+
+ ENGINE:
+   NAME: 'UnifiedTrainer'
+
+ MODEL:
+   META_ARCHITECTURE: 'MultiTaskTransformerEncoder'
+   ENCODER: 'UnifiedBertEncoder'
+
+   IN_TUNING: True # use IN1k instead of 22k
+   SHARE_LAYERNORM: True
+   BERT:
+     NORMALIZE_DECISION: "BERTPre"
+     DROP_PATH_PROB: 0.0
+     DROP_PATH_PROB_FIXED: True
+
+   MODEL_EMA: False
+   MODEL_EMA_DECAY: 0.9999
+
+   MAEParamsInit: True
+   POSEMBEDFIX: True
+
+   IMG_INPUT_SIZE: 224
+   PATCH_SIZE: 16
+
+   LAYER_SCALE: True
+   LAYER_SCALE_INIT: 1e-3
+
+ DATALOADER:
+   USE_WEIGHTED_SAMPLER: True
+   UNIFIED_DATASET: True
+   NUM_WORKERS: 32
+
+   PADDING_TO_MAX: False # True for debugging or token moe with distributed moe
+
+ ####################################### Optimizer #######################################
+ SOLVER:
+   NAME: 'Adam'
+   TORCH_OPTIMIZER: True
+   PARAMS_SEPERATE: True
+   # PARAMS_GROUP: True
+   # EPOCH: 1
+   MAX_ITER: 150000
+   CHECKPOINT_PERIOD: 5000
+   EVAL_PERIOD: 500000
+   BASE_LR: 0.001
+   BIAS_LR_FACTOR: 1.0
+   WEIGHT_DECAY: 0.05
+   WEIGHT_DECAY_NORM: 0.0
+   WEIGHT_DECAY_BIAS: 0.0
+   WEIGHT_DECAY_EMBEDDING: 0.0
+   MOMENTUM: 0.9
+   DAMPENING: 0.0
+   NESTEROV: 0.0
+   BETAS: [0.9, 0.95]
+   EPS: 1e-6
+   GRAD_CLIP: 0.1
+   GRAD_CLIP_TYPE: 'norm'
+   ACCUM_ITER: 0
+   AMP_FP16: True
+   APEX_FP16: False # dangerous
+
+   WRITE_PERIOD: 50
+   MIN_LOSS_SCLE: 2048.0
+   # BF16: False # True
+   # ZEROSTAGE: 2
+
+   LOSS_SCALE_WINDOW: 200
+
+ ####################################### lr scheduler #######################################
+ LR_SCHEDULER:
+   NAME: 'WarmupCosine'
+   WARMUP: 5000
+   MIN_LR: 0.000001
+
+ ####################################### evaluation #######################################
+ INFERENCE:
+   VOCAB: 'CLIP'
+   ITER_BASED: True
+
+ find_unused_parameters: true
+
+ # ENCODERS:
+ #   -
+ #     NAME: VisualEncoder
+ #     TYPE: VisualEncoder
+ #     DROP_PATH_PROB: 0.0
+ #     HIDDEN_SIZE: 192
+ #     HIDDEN_DROPOUT_PROB: 0.
+ #     HIDDEN_ACT: "gelu"
+ #     NUM_ATTENTION_HEADS: 3
+ #     INTERMEDIATE_SIZE: 768
+ #     INTERMEDIATE_DROP: 0.
+ #     FFN_DROPOUT_PROB: 0.
+ #     ATTENTION_PROBS_DROPOUT_PROB: 0.
+ #     NUM_HIDDEN_LAYERS: 6
+ #     NUM_GENERATION_LAYERS: 0
+ #     DROP_PATH_PROB_FIXED: True
+
+ #   -
+ #     NAME: TextEncoder
+ #     TYPE: TextEncoder
+ #     DROP_PATH_PROB: 0.0
+ #     HIDDEN_SIZE: 192
+ #     HIDDEN_DROPOUT_PROB: 0.
+ #     HIDDEN_ACT: "gelu"
+ #     NUM_ATTENTION_HEADS: 3
+ #     INTERMEDIATE_SIZE: 768
+ #     INTERMEDIATE_DROP: 0.
+ #     FFN_DROPOUT_PROB: 0.
+ #     ATTENTION_PROBS_DROPOUT_PROB: 0.
+ #     NUM_HIDDEN_LAYERS: 6
+ #     NUM_GENERATION_LAYERS: 0
+ #     DROP_PATH_PROB_FIXED: True
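This file, like the others in the directory, begins with `_BASE_: "base_model_bert_l12_h192.yaml"` and only overrides what differs. A minimal sketch of that base-then-override inheritance pattern (illustrative; the repo's own config loader may resolve base paths and merge differently):

    import yaml  # assumes PyYAML is available

    def deep_update(dst, src):
        # recursively overlay child keys onto the base dict
        for k, v in src.items():
            if isinstance(v, dict) and isinstance(dst.get(k), dict):
                deep_update(dst[k], v)
            else:
                dst[k] = v

    def load_cfg(path):
        with open(path) as f:
            cfg = yaml.safe_load(f)
        base = cfg.pop('_BASE_', None)  # base path resolution is simplified here
        if base:
            merged = load_cfg(base)
            deep_update(merged, cfg)
            return merged
        return cfg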
configs/BERT_L12_H192_experiments/7tasks_berttiny_training_apex_o2.yaml ADDED
@@ -0,0 +1,9 @@
+ _BASE_: "7tasks_berttiny_training.yaml"
+
+ ####################################### Optimizer #######################################
+ SOLVER:
+
+   AMP_FP16: False
+   APEX_FP16: True # dangerous
+   APEX_OPT_LEVEL: 'O2'
+   CHECKPOINT_PERIOD: 100000
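This override swaps native AMP for NVIDIA Apex at opt_level O2 (FP16 weights with FP32 master weights). For reference, the standard Apex wiring looks like the sketch below; this is the generic Apex API, not this repo's exact training loop:

    import torch
    from apex import amp  # NVIDIA Apex mixed-precision library

    model = torch.nn.Linear(8, 8).cuda()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    # O2: cast model to FP16, keep FP32 master weights and dynamic loss scaling
    model, optimizer = amp.initialize(model, optimizer, opt_level='O2')

    loss = model(torch.randn(4, 8, device='cuda')).sum()
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()
    optimizer.step()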
configs/BERT_L12_H192_experiments/7tasks_berttiny_training_lamb.yaml ADDED
@@ -0,0 +1,418 @@
+ _BASE_: "base_model_bert_l12_h192.yaml"
+
+ SHARED_TARGETS:
+
+   -
+     NAME: 'ImageNet1k'
+     SHARED_TARGETS_CFG:
+       FILE_PATH: 'open_source_dataset/imagenet_class_name_CLIP_with_endoftext.pkl'
+       DISTRIBUTED: False
+
+   -
+     NAME: 'Vocab_Word'
+     SHARED_TARGETS_CFG:
+       FILE_PATH: 'open_source_dataset/vocabulary_CLIP_with_endoftext.pkl'
+       DISTRIBUTED: True
+
+ TASKS:
+
+   -
+     NAME: imagenet
+     DATASETS:
+       TRAIN: 'ImageNetDataset'
+       VAL: 'ImageNetDataset'
+       TASK_TYPE: 'image_classification'
+       DATASET_NAME: 'ImageNet1k'
+       TARGET_SET: ['ImageNet1k']
+
+     DATALOADER:
+       TRAIN_BATCH_SIZE: 720
+       TEST_BATCH_SIZE: 256
+       NUM_WORKERS: 4
+       FEATS_FOLDER: 'open_source_dataset/imagenet'
+       S3_PATH: 'cluster2:s3://imagenet'
+       ANNO_FOLDER: 'open_source_dataset/imagenet/meta'
+       SAMPLING_WEIGHT: 2.5
+       CLASS_NAME_FILE: 'open_source_dataset/imagenet_class_name.pkl'
+       MIXUP: 0.8
+       CUTMIX: 1.0
+       MIXUP_PROB: 1.0
+       MIXUP_SWITCH_PROB: 0.5
+       MIXUP_MODE: 'batch'
+       MIXUP_LABEL_SMOOTHING: 0.1
+     MODEL:
+       MAX_SEQ_LEN: -1
+       LABELS_NUM: 1000
+       TEMP_NAME: logit_scale_img_cls
+     LOSSES:
+       NAMES: ['SoftTargetCrossEntropy', 'Accuracy']
+       LOSS_WEIGHT: 1.0
+       REDUCTION: 'mean'
+       # LOSS_FP32: True
+     INFERENCE:
+       NAME: 'ImageNetEvaler'
+       ID_KEY: 'image_id'
+       VALUE: 'cls_logits'
+       VAL_ANNFILE: 'open_source_dataset/imagenet/meta/val.txt'
+       TEST_ANNFILE: ''
+       GENERATION_MODE: False
+
+   -
+     NAME: bookswiki_pretrain
+     DATASETS:
+       TRAIN: 'GeneralCorpusDataset'
+       TASK_TYPE: 'text_mlm'
+       DATASET_NAME: 'BooksWiki'
+       TARGET_SET: ['Vocab_Word']
+       VERSION: 'v2'
+     DATALOADER:
+       TRAIN_BATCH_SIZE: 512
+       TEST_BATCH_SIZE: 32
+       NUM_WORKERS: 2
+       ANNO_FOLDER: 'open_source_dataset/text_corpus' # 'open_source_dataset/bert_pretrain_data/bookswiki'
+       # ANNO_FOLDER: 'open_source_dataset/bert_pretrain_data/bookswiki'
+       SEQ_PER_SAMPLE: 1
+       SAMPLER: NodeDistributed
+       CACHE_MODE: True
+       SEQ_PER_SAMPLE: 128
+       MIN_SEQ_PER_SAMPLE: 128
+       APPEND_EOS: True
+       ONE_STREAM: False
+       SAMPLING_WEIGHT: 3.5
+       RANDOM_MASK: True
+     MODEL:
+       MAX_SEQ_LEN: 128
+       TEMP_NAME: logit_scale_text_mlm
+     LOSSES:
+       NAMES: ['CrossEntropy', 'Accuracy']
+       LOSS_WEIGHT: 0.33333
+       REDUCTION: 'mean'
+     INFERENCE:
+       VOCAB: 'CLIP'
+       GENERATION_MODE: False
+
+   ########## Image Captioning ###########
+
+   -
+     NAME: cc12m_caption
+     DATASETS:
+       TRAIN: 'ImageTextPairDataset'
+       TASK_TYPE: 'image_caption'
+       DATASET_NAME: 'CC12M'
+       TARGET_SET: ['Vocab_Word']
+     DATALOADER:
+       TRAIN_BATCH_SIZE: 300
+       TEST_BATCH_SIZE: 32
+       NUM_WORKERS: 2
+       S3_ANNO_FOLDER: 's3://cc12m/'
+       ANNO_FOLDER: 'open_source_dataset/c12m/'
+       ANNO_FILENAME: 'train_available.json'
+       FEATS_FOLDER: 'open_source_dataset/c12m/'
+       S3_PATH: 's3://cc12m/'
+       SEQ_PER_SAMPLE: 1
+       SAMPLER: NodeDistributed
+       CACHE_MODE: True
+       CIRCULAR_CACHE_MODE: False
+       ZIP_MODE: False
+       CACHE_ORIGIN_IMAGE: False
+       RANDOM_CAPTION: False
+       AS_NUMPY_AS_POSSIBLE: False
+       SAMPLING_WEIGHT: 1.6889
+       TRANSFORM: 'clip_transforms'
+     MODEL:
+       MAX_SEQ_LEN: 50
+       TEMP_NAME: logit_scale_caption
+     LOSSES:
+       NAMES: ['CrossEntropy', 'Accuracy']
+       LOSS_WEIGHT: 0.33333
+       REDUCTION: 'mean'
+     INFERENCE:
+       VOCAB: 'CLIP'
+       GENERATION_MODE: False
+
+   -
+     NAME: cc3m_caption
+     DATASETS:
+       TRAIN: 'ImageTextPairDataset'
+       TASK_TYPE: 'image_caption'
+       DATASET_NAME: 'CC3M'
+       TARGET_SET: ['Vocab_Word']
+     DATALOADER:
+       TRAIN_BATCH_SIZE: 300
+       TEST_BATCH_SIZE: 32
+       NUM_WORKERS: 2
+       ANNO_FOLDER: 's3://cc3m/'
+       ANNO_FILENAME: 'train_spacy.json'
+       FEATS_FOLDER: 'open_source_dataset/cc3m/'
+       S3_PATH: 's3://cc3m/'
+       SEQ_PER_SAMPLE: 1
+       SAMPLER: NodeDistributed
+       CACHE_MODE: True
+       CIRCULAR_CACHE_MODE: False
+       ZIP_MODE: False
+       CACHE_ORIGIN_IMAGE: False
+       RANDOM_CAPTION: False
+       AS_NUMPY_AS_POSSIBLE: False
+       SAMPLING_WEIGHT: 0.8780
+       TRANSFORM: 'clip_transforms'
+     MODEL:
+       MAX_SEQ_LEN: 50
+       TEMP_NAME: logit_scale_caption
+     LOSSES:
+       NAMES: ['CrossEntropy', 'Accuracy']
+       LOSS_WEIGHT: 0.33333
+       REDUCTION: 'mean'
+     INFERENCE:
+       VOCAB: 'CLIP'
+       GENERATION_MODE: False
+
+   -
+     NAME: vg_caption
+     DATASETS:
+       TRAIN: 'ImageTextPairDataset'
+       TASK_TYPE: 'image_caption'
+       DATASET_NAME: 'VG'
+       TARGET_SET: ['Vocab_Word']
+     DATALOADER:
+       TRAIN_BATCH_SIZE: 300
+       TEST_BATCH_SIZE: 32
+       NUM_WORKERS: 2
+       FEATS_FOLDER: 'open_source_dataset/visual_genome/images'
+       ANNO_FOLDER: 'open_source_dataset/visual_genome/annotations'
+       S3_PATH: 's3://visual_genome/images'
+       ANNO_FILENAME: 'vg_captions_128filter.json'
+       SEQ_PER_SAMPLE: 1
+       CACHE_MODE: True
+       CIRCULAR_CACHE_MODE: False
+       ZIP_MODE: False
+       CACHE_ORIGIN_IMAGE: False
+       RANDOM_CAPTION: False
+       AS_NUMPY_AS_POSSIBLE: False
+       SAMPLING_WEIGHT: 0.5895
+       TRANSFORM: 'clip_transforms'
+     MODEL:
+       MAX_SEQ_LEN: 30
+       TEMP_NAME: logit_scale_caption
+     LOSSES:
+       NAMES: ['CrossEntropy', 'Accuracy']
+       LOSS_WEIGHT: 0.33333
+       REDUCTION: 'mean'
+     INFERENCE:
+       VOCAB: 'CLIP'
+       GENERATION_MODE: True
+
+   -
+     NAME: mscoco_caption
+     DATASETS:
+       TRAIN: 'ImageTextPairDataset'
+       # VAL: 'ImageTextPairDataset'
+       TEST: 'ImageTextPairDataset'
+       TASK_TYPE: 'image_caption'
+       DATASET_NAME: 'MSCOCO'
+       TARGET_SET: ['Vocab_Word']
+     DATALOADER:
+       TRAIN_BATCH_SIZE: 300
+       TEST_BATCH_SIZE: 32
+       NUM_WORKERS: 4
+       FEATS_FOLDER: 'open_source_dataset/mscoco_dataset/coco_origin'
+       ANNO_FOLDER: 'open_source_dataset/mscoco_dataset/new_annotations'
+       S3_PATH: 's3://coco/'
+       SEQ_PER_SAMPLE: 1
+       CACHE_MODE: True
+       CIRCULAR_CACHE_MODE: False
+       ZIP_MODE: False
+       CACHE_ORIGIN_IMAGE: False
+       RANDOM_CAPTION: False
+       AS_NUMPY_AS_POSSIBLE: False
+       SAMPLING_WEIGHT: 0.3817
+       TRANSFORM: 'clip_transforms'
+       RANDOM_MASK: True
+     MODEL:
+       MAX_SEQ_LEN: 50
+       EVAL_MAX_SEQ_LEN: 21
+       TEMP_NAME: logit_scale_caption
+     LOSSES:
+       NAMES: ['CrossEntropy', 'Accuracy']
+       LOSS_WEIGHT: 0.33333
+       REDUCTION: 'mean'
+     DECODE_STRATEGY:
+       NAME: 'CaptionBeamSearcherV3'
+       BEAM_SIZE: 2
+       # LEN_PENALTY: 1.0
+     INFERENCE:
+       NAME: 'COCOEvaler'
+       VOCAB: 'CLIP'
+       ID_KEY: 'image_id'
+       VALUE: 'caption'
+       VAL_ANNFILE: 'open_source_dataset/mscoco_dataset/new_annotations/captions_val5k.json'
+       TEST_ANNFILE: 'open_source_dataset/mscoco_dataset/new_annotations/captions_test5k.json'
+       GENERATION_MODE: True
+
+   -
+     NAME: sbu_caption
+     DATASETS:
+       TRAIN: 'ImageTextPairDataset'
+       TASK_TYPE: 'image_caption'
+       DATASET_NAME: 'SBU'
+       TARGET_SET: ['Vocab_Word']
+     DATALOADER:
+       TRAIN_BATCH_SIZE: 300
+       TEST_BATCH_SIZE: 32
+       NUM_WORKERS: 1
+       S3_ANNO_FOLDER: 's3://SBU/annotations'
+       ANNO_FOLDER: 'open_source_dataset/sbucaption/annotations'
+       ANNO_FILENAME: 'subcaption.json'
+       FEATS_FOLDER: 'open_source_dataset/sbucaption/'
+       S3_PATH: 's3://SBU/images'
+       SEQ_PER_SAMPLE: 1
+       SAMPLER: NodeDistributed
+       CACHE_MODE: True
+       CIRCULAR_CACHE_MODE: False
+       ZIP_MODE: False
+       CACHE_ORIGIN_IMAGE: False
+       RANDOM_CAPTION: False
+       AS_NUMPY_AS_POSSIBLE: False
+       SAMPLING_WEIGHT: 0.4618
+       TRANSFORM: 'clip_transforms'
+     MODEL:
+       MAX_SEQ_LEN: 50
+       TEMP_NAME: logit_scale_caption
+     LOSSES:
+       NAMES: ['CrossEntropy', 'Accuracy']
+       LOSS_WEIGHT: 0.33333
+       REDUCTION: 'mean'
+     INFERENCE:
+       VOCAB: 'CLIP'
+       GENERATION_MODE: False
+
+ ENGINE:
+   NAME: 'UnifiedTrainer'
+
+ MODEL:
+   META_ARCHITECTURE: 'MultiTaskTransformerEncoder'
+   ENCODER: 'UnifiedBertEncoder'
+
+   IN_TUNING: True # use IN1k instead of 22k
+   SHARE_LAYERNORM: True
+   BERT:
+     NORMALIZE_DECISION: "BERTPre"
+     DROP_PATH_PROB: 0.0
+     DROP_PATH_PROB_FIXED: True
+
+   MODEL_EMA: False
+   MODEL_EMA_DECAY: 0.9999
+
+   MAEParamsInit: True
+   POSEMBEDFIX: True
+
+   IMG_INPUT_SIZE: 224
+   PATCH_SIZE: 16
+
+   LAYER_SCALE: True
+   LAYER_SCALE_INIT: 1e-3
+
+ DATALOADER:
+   USE_WEIGHTED_SAMPLER: True
+   UNIFIED_DATASET: True
+   NUM_WORKERS: 32
+
+   PADDING_TO_MAX: False # True for debugging or token moe with distributed moe
+
+ ####################################### Optimizer #######################################
+ SOLVER:
+   NAME: 'LAMB'
+   TORCH_OPTIMIZER: True
+   PARAMS_SEPERATE: True
+   # PARAMS_GROUP: True
+   # EPOCH: 1
+   MAX_ITER: 150000
+   CHECKPOINT_PERIOD: 5000
+   EVAL_PERIOD: 500000
+   BASE_LR: 0.01
+   BIAS_LR_FACTOR: 1.0
+   WEIGHT_DECAY: 0.05
+   WEIGHT_DECAY_NORM: 0.0
+   WEIGHT_DECAY_BIAS: 0.0
+   WEIGHT_DECAY_EMBEDDING: 0.0
+   MOMENTUM: 0.9
+   DAMPENING: 0.0
+   NESTEROV: 0.0
+   BETAS: [0.9, 0.95]
+   EPS: 1e-6
+   GRAD_CLIP: 0.1
+   GRAD_CLIP_TYPE: 'norm'
+   ACCUM_ITER: 0
+   AMP_FP16: True
+   APEX_FP16: False # dangerous
+
+   WRITE_PERIOD: 50
+   MIN_LOSS_SCLE: 2048.0
+   # BF16: False # True
+   # ZEROSTAGE: 2
+
+   LOSS_SCALE_WINDOW: 200
+
+ ####################################### lr scheduler #######################################
+ LR_SCHEDULER:
+   NAME: 'WarmupCosine'
+   WARMUP: 5000
+   MIN_LR: 0.000001
+
+ ####################################### evaluation #######################################
+ INFERENCE:
+   VOCAB: 'CLIP'
+   ITER_BASED: True
+
+ find_unused_parameters: true
+
+ # ENCODERS:
+ #   -
+ #     NAME: VisualEncoder
+ #     TYPE: VisualEncoder
+ #     DROP_PATH_PROB: 0.0
+ #     HIDDEN_SIZE: 192
+ #     HIDDEN_DROPOUT_PROB: 0.
+ #     HIDDEN_ACT: "gelu"
+ #     NUM_ATTENTION_HEADS: 3
+ #     INTERMEDIATE_SIZE: 768
+ #     INTERMEDIATE_DROP: 0.
+ #     FFN_DROPOUT_PROB: 0.
+ #     ATTENTION_PROBS_DROPOUT_PROB: 0.
+ #     NUM_HIDDEN_LAYERS: 6
+ #     NUM_GENERATION_LAYERS: 0
+ #     DROP_PATH_PROB_FIXED: True
+
+ #   -
+ #     NAME: TextEncoder
+ #     TYPE: TextEncoder
+ #     DROP_PATH_PROB: 0.0
+ #     HIDDEN_SIZE: 192
+ #     HIDDEN_DROPOUT_PROB: 0.
+ #     HIDDEN_ACT: "gelu"
+ #     NUM_ATTENTION_HEADS: 3
+ #     INTERMEDIATE_SIZE: 768
+ #     INTERMEDIATE_DROP: 0.
+ #     FFN_DROPOUT_PROB: 0.
+ #     ATTENTION_PROBS_DROPOUT_PROB: 0.
+ #     NUM_HIDDEN_LAYERS: 6
+ #     NUM_GENERATION_LAYERS: 0
+ #     DROP_PATH_PROB_FIXED: True
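Relative to the Adam variant, this file swaps the optimizer for LAMB and raises BASE_LR to 0.01. LAMB rescales each layer's Adam-style update by a trust ratio ||w|| / ||u||, which is what makes the larger learning rate tolerable at large batch sizes. An illustrative trust-ratio step (not the repo's optimizer code):

    import torch

    def lamb_delta(weight, adam_update, lr):
        # scale the layer's update so its magnitude tracks the weight norm
        w_norm = weight.norm()
        u_norm = adam_update.norm()
        trust = (w_norm / u_norm) if (w_norm > 0 and u_norm > 0) else 1.0
        return -lr * trust * adam_update  # delta added to the weights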
configs/BERT_L12_H192_experiments/7tasks_berttiny_training_moe.yaml ADDED
@@ -0,0 +1,25 @@
+ _BASE_: "7tasks_berttiny_training.yaml"
+
+ MOE:
+   MOE: True
+   MOE_TYPE: 'attribute'
+   TAG_Transform: True
+   ATTRIBUTE_LENGTH: 8
+   EP_WORLD_SIZE: 1 # tag moe only
+   NUM_EXPERTS: 8
+   TOP_K: 2
+   CAPACITY_FACTOR: 3.0
+   EVAL_MIN_CAPACITY: 4.0
+   MIN_CAPACITY: 4
+   NOISY_GATE_POLICY: 'vmoe'
+   MOE_PARAM_GROUP: True
+   MOE_EXPERT_TYPE: 'FFN,SA'
+   SA_LINEAR_OUT_MOE: True
+   MOE_EXPERT_LOCATION: 'all' # 'odd'
+   # MOE_LAYER_START_IDX: 3
+   # MOE_LAYER_END_IDX: 21
+   # MOE_LAYER_START_IDX: 18
+   # MOE_LAYER_END_IDX: 12
+   BATCH_PRIO: True
+   USE_TUTEL: True
+   FFN_SHARE_GATE_DECISION: True
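With TOP_K: 2 routing over NUM_EXPERTS: 8, the CAPACITY_FACTOR and MIN_CAPACITY fields bound how many tokens each expert may accept; overflow tokens are dropped or rerouted, with BATCH_PRIO deciding which survive. A back-of-envelope capacity check, under the assumption that capacity scales as capacity_factor * top_k * tokens / num_experts (conventions differ between MoE implementations):

    import math

    def expert_capacity(num_tokens, num_experts, capacity_factor, top_k, min_capacity):
        # assumed formula; Tutel/DeepSpeed variants differ in details
        cap = math.ceil(capacity_factor * top_k * num_tokens / num_experts)
        return max(cap, min_capacity)

    print(expert_capacity(1024, 8, 3.0, 2, 4))  # -> 768 token slots per expert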
configs/BERT_L12_H192_experiments/7tasks_berttiny_training_moe_lsfp32_gate_softmax_layernorm_fp16.yaml ADDED
@@ -0,0 +1,42 @@
+ _BASE_: "7tasks_berttiny_training.yaml"
+
+ MOE:
+   MOE: True
+   MOE_TYPE: 'attribute'
+   TAG_Transform: True
+   ATTRIBUTE_LENGTH: 8
+   EP_WORLD_SIZE: 1 # tag moe only
+   NUM_EXPERTS: 8
+   TOP_K: 2
+   CAPACITY_FACTOR: 3.0
+   EVAL_MIN_CAPACITY: 4.0
+   MIN_CAPACITY: 4
+   NOISY_GATE_POLICY: 'vmoe'
+   MOE_PARAM_GROUP: True
+   MOE_EXPERT_TYPE: 'FFN,SA'
+   SA_LINEAR_OUT_MOE: True
+   MOE_EXPERT_LOCATION: 'all' # 'odd'
+   # MOE_LAYER_START_IDX: 3
+   # MOE_LAYER_END_IDX: 21
+   # MOE_LAYER_START_IDX: 18
+   # MOE_LAYER_END_IDX: 12
+   BATCH_PRIO: True
+   USE_TUTEL: True
+   FFN_SHARE_GATE_DECISION: True
+
+ MODEL:
+   LAYER_SCALE_FP32: True
+   GATE_FP32: False
+   TAG_TRANSFORM_FP32: False
+
+ SOLVER:
+
+   FORCE_SOFTMAX_FP16: True
+   FORCE_LN_FP16: True
+   FORCE_NORM_FP16: True
+   # FORCE_TEMP_FP16: True
+   FORCE_EMBED_FP16: True
+
+   # FORCE_EXPERT_ADDING_FP16: True
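The MODEL and SOLVER flags above pick, per operation, what stays in FP32 under mixed precision (here the layer scale) and what is forced into FP16 (softmax, LayerNorm, embeddings). The usual mechanics behind such flags are explicit casts that bypass autocast defaults, roughly like this generic PyTorch sketch (not the repo's actual hooks):

    import torch

    def gate_softmax_fp32(logits):
        # keep a routing softmax in FP32 for numerical stability ...
        return torch.softmax(logits.float(), dim=-1)

    def softmax_forced_fp16(logits):
        # ... or pin it to FP16, as FORCE_SOFTMAX_FP16 above opts to do
        return torch.softmax(logits.half(), dim=-1)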
configs/BERT_L12_H192_experiments/7tasks_berttiny_training_moe_scale_before.yaml ADDED
@@ -0,0 +1,444 @@
+ _BASE_: "base_model_bert_l12_h192.yaml"
+
+ SHARED_TARGETS:
+
+   -
+     NAME: 'ImageNet1k'
+     SHARED_TARGETS_CFG:
+       FILE_PATH: 'open_source_dataset/imagenet_class_name_CLIP_with_endoftext.pkl'
+       DISTRIBUTED: False
+
+   -
+     NAME: 'Vocab_Word'
+     SHARED_TARGETS_CFG:
+       FILE_PATH: 'open_source_dataset/vocabulary_CLIP_with_endoftext.pkl'
+       DISTRIBUTED: True
+
+ TASKS:
+
+   -
+     NAME: imagenet
+     DATASETS:
+       TRAIN: 'ImageNetDataset'
+       VAL: 'ImageNetDataset'
+       TASK_TYPE: 'image_classification'
+       DATASET_NAME: 'ImageNet1k'
+       TARGET_SET: ['ImageNet1k']
+
+     DATALOADER:
+       TRAIN_BATCH_SIZE: 720
+       # TEST_BATCH_SIZE: 2
+       NUM_WORKERS: 4
+       FEATS_FOLDER: 'open_source_dataset/imagenet'
+       S3_PATH: 'cluster2:s3://imagenet'
+       ANNO_FOLDER: 'open_source_dataset/imagenet/meta'
+       SAMPLING_WEIGHT: 2.5
+       CLASS_NAME_FILE: 'open_source_dataset/imagenet_class_name.pkl'
+       MIXUP: 0.8
+       CUTMIX: 1.0
+       MIXUP_PROB: 1.0
+       MIXUP_SWITCH_PROB: 0.5
+       MIXUP_MODE: 'batch'
+       MIXUP_LABEL_SMOOTHING: 0.1
+     MODEL:
+       MAX_SEQ_LEN: -1
+       LABELS_NUM: 1000
+       TEMP_NAME: logit_scale_img_cls
+     LOSSES:
+       NAMES: ['SoftTargetCrossEntropy', 'Accuracy']
+       LOSS_WEIGHT: 1.0
+       REDUCTION: 'mean'
+       # LOSS_FP32: True
+     INFERENCE:
+       NAME: 'ImageNetEvaler'
+       ID_KEY: 'image_id'
+       VALUE: 'cls_logits'
+       VAL_ANNFILE: 'open_source_dataset/imagenet/meta/val.txt'
+       TEST_ANNFILE: ''
+       GENERATION_MODE: False
+
+   -
+     NAME: bookswiki_pretrain
+     DATASETS:
+       TRAIN: 'GeneralCorpusDataset'
+       TASK_TYPE: 'text_mlm'
+       DATASET_NAME: 'BooksWiki'
+       TARGET_SET: ['Vocab_Word']
+       VERSION: 'v2'
+     DATALOADER:
+       TRAIN_BATCH_SIZE: 512
+       TEST_BATCH_SIZE: 32
+       NUM_WORKERS: 2
+       ANNO_FOLDER: 'open_source_dataset/text_corpus' # 'open_source_dataset/bert_pretrain_data/bookswiki'
+       # ANNO_FOLDER: 'open_source_dataset/bert_pretrain_data/bookswiki'
+       SEQ_PER_SAMPLE: 1
+       SAMPLER: NodeDistributed
+       CACHE_MODE: True
+       SEQ_PER_SAMPLE: 128
+       MIN_SEQ_PER_SAMPLE: 128
+       APPEND_EOS: True
+       ONE_STREAM: False
+       SAMPLING_WEIGHT: 3.5
+       RANDOM_MASK: True
+     MODEL:
+       MAX_SEQ_LEN: 128
+       TEMP_NAME: logit_scale_text_mlm
+     LOSSES:
+       NAMES: ['CrossEntropy', 'Accuracy']
+       LOSS_WEIGHT: 0.33333
+       REDUCTION: 'mean'
+     INFERENCE:
+       VOCAB: 'CLIP'
+       GENERATION_MODE: False
+
+   ########## Image Captioning ###########
+
+   -
+     NAME: cc12m_caption
+     DATASETS:
+       TRAIN: 'ImageTextPairDataset'
+       TASK_TYPE: 'image_caption'
+       DATASET_NAME: 'CC12M'
+       TARGET_SET: ['Vocab_Word']
+     DATALOADER:
+       TRAIN_BATCH_SIZE: 300
+       TEST_BATCH_SIZE: 32
+       NUM_WORKERS: 2
+       S3_ANNO_FOLDER: 's3://cc12m/'
+       ANNO_FOLDER: 'open_source_dataset/c12m/'
+       ANNO_FILENAME: 'train_available.json'
+       FEATS_FOLDER: 'open_source_dataset/c12m/'
+       S3_PATH: 's3://cc12m/'
+       SEQ_PER_SAMPLE: 1
+       SAMPLER: NodeDistributed
+       CACHE_MODE: True
+       CIRCULAR_CACHE_MODE: False
+       ZIP_MODE: False
+       CACHE_ORIGIN_IMAGE: False
+       RANDOM_CAPTION: False
+       AS_NUMPY_AS_POSSIBLE: False
+       SAMPLING_WEIGHT: 1.6889
+       TRANSFORM: 'clip_transforms'
+     MODEL:
+       MAX_SEQ_LEN: 50
+       TEMP_NAME: logit_scale_caption
+     LOSSES:
+       NAMES: ['CrossEntropy', 'Accuracy']
+       LOSS_WEIGHT: 0.33333
+       REDUCTION: 'mean'
+     INFERENCE:
+       VOCAB: 'CLIP'
+       GENERATION_MODE: False
+
+   -
+     NAME: cc3m_caption
+     DATASETS:
+       TRAIN: 'ImageTextPairDataset'
+       TASK_TYPE: 'image_caption'
+       DATASET_NAME: 'CC3M'
+       TARGET_SET: ['Vocab_Word']
+     DATALOADER:
+       TRAIN_BATCH_SIZE: 300
+       TEST_BATCH_SIZE: 32
+       NUM_WORKERS: 2
+       ANNO_FOLDER: 's3://cc3m/'
+       ANNO_FILENAME: 'train_spacy.json'
+       FEATS_FOLDER: 'open_source_dataset/cc3m/'
+       S3_PATH: 's3://cc3m/'
+       SEQ_PER_SAMPLE: 1
+       SAMPLER: NodeDistributed
+       CACHE_MODE: True
+       CIRCULAR_CACHE_MODE: False
+       ZIP_MODE: False
+       CACHE_ORIGIN_IMAGE: False
+       RANDOM_CAPTION: False
+       AS_NUMPY_AS_POSSIBLE: False
+       SAMPLING_WEIGHT: 0.8780
+       TRANSFORM: 'clip_transforms'
+     MODEL:
+       MAX_SEQ_LEN: 50
+       TEMP_NAME: logit_scale_caption
+     LOSSES:
+       NAMES: ['CrossEntropy', 'Accuracy']
+       LOSS_WEIGHT: 0.33333
+       REDUCTION: 'mean'
+     INFERENCE:
+       VOCAB: 'CLIP'
+       GENERATION_MODE: False
+
+   -
+     NAME: vg_caption
+     DATASETS:
+       TRAIN: 'ImageTextPairDataset'
+       TASK_TYPE: 'image_caption'
+       DATASET_NAME: 'VG'
+       TARGET_SET: ['Vocab_Word']
+     DATALOADER:
+       TRAIN_BATCH_SIZE: 300
+       TEST_BATCH_SIZE: 32
+       NUM_WORKERS: 2
+       FEATS_FOLDER: 'open_source_dataset/visual_genome/images'
+       ANNO_FOLDER: 'open_source_dataset/visual_genome/annotations'
+       S3_PATH: 's3://visual_genome/images'
+       ANNO_FILENAME: 'vg_captions_128filter.json'
+       SEQ_PER_SAMPLE: 1
+       CACHE_MODE: True
+       CIRCULAR_CACHE_MODE: False
+       ZIP_MODE: False
+       CACHE_ORIGIN_IMAGE: False
+       RANDOM_CAPTION: False
+       AS_NUMPY_AS_POSSIBLE: False
+       SAMPLING_WEIGHT: 0.5895
+       TRANSFORM: 'clip_transforms'
+     MODEL:
+       MAX_SEQ_LEN: 30
+       TEMP_NAME: logit_scale_caption
+     LOSSES:
+       NAMES: ['CrossEntropy', 'Accuracy']
+       LOSS_WEIGHT: 0.33333
+       REDUCTION: 'mean'
+     INFERENCE:
+       VOCAB: 'CLIP'
+       GENERATION_MODE: True
+
+   -
+     NAME: mscoco_caption
+     DATASETS:
+       TRAIN: 'ImageTextPairDataset'
+       # VAL: 'ImageTextPairDataset'
+       # TEST: 'ImageTextPairDataset'
+       TASK_TYPE: 'image_caption'
+       DATASET_NAME: 'MSCOCO'
+       TARGET_SET: ['Vocab_Word']
+     DATALOADER:
+       TRAIN_BATCH_SIZE: 300
+       TEST_BATCH_SIZE: 32
+       NUM_WORKERS: 4
+       FEATS_FOLDER: 'open_source_dataset/mscoco_dataset/coco_origin'
+       ANNO_FOLDER: 'open_source_dataset/mscoco_dataset/new_annotations'
+       S3_PATH: 's3://coco/'
+       SEQ_PER_SAMPLE: 1
+       CACHE_MODE: True
+       CIRCULAR_CACHE_MODE: False
+       ZIP_MODE: False
+       CACHE_ORIGIN_IMAGE: False
+       RANDOM_CAPTION: False
+       AS_NUMPY_AS_POSSIBLE: False
+       SAMPLING_WEIGHT: 0.3817
+       TRANSFORM: 'clip_transforms'
+       RANDOM_MASK: True
+     MODEL:
+       MAX_SEQ_LEN: 50
+       EVAL_MAX_SEQ_LEN: 21
+       TEMP_NAME: logit_scale_caption
+     LOSSES:
+       NAMES: ['CrossEntropy', 'Accuracy']
+       LOSS_WEIGHT: 0.33333
+       REDUCTION: 'mean'
+     DECODE_STRATEGY:
+       NAME: 'CaptionBeamSearcherV3'
+       BEAM_SIZE: 2
+       # LEN_PENALTY: 1.0
+     INFERENCE:
+       NAME: 'COCOEvaler'
+       VOCAB: 'CLIP'
+       ID_KEY: 'image_id'
+       VALUE: 'caption'
+       VAL_ANNFILE: 'open_source_dataset/mscoco_dataset/new_annotations/captions_val5k.json'
+       TEST_ANNFILE: 'open_source_dataset/mscoco_dataset/new_annotations/captions_test5k.json'
+       GENERATION_MODE: True
+
+   -
+     NAME: sbu_caption
+     DATASETS:
+       TRAIN: 'ImageTextPairDataset'
+       TASK_TYPE: 'image_caption'
+       DATASET_NAME: 'SBU'
+       TARGET_SET: ['Vocab_Word']
+     DATALOADER:
+       TRAIN_BATCH_SIZE: 300
+       TEST_BATCH_SIZE: 32
+       NUM_WORKERS: 1
+       S3_ANNO_FOLDER: 's3://SBU/annotations'
+       ANNO_FOLDER: 'open_source_dataset/sbucaption/annotations'
+       ANNO_FILENAME: 'subcaption.json'
+       FEATS_FOLDER: 'open_source_dataset/sbucaption/'
+       S3_PATH: 's3://SBU/images'
+       SEQ_PER_SAMPLE: 1
+       SAMPLER: NodeDistributed
+       CACHE_MODE: True
+       CIRCULAR_CACHE_MODE: False
+       ZIP_MODE: False
+       CACHE_ORIGIN_IMAGE: False
+       RANDOM_CAPTION: False
+       AS_NUMPY_AS_POSSIBLE: False
+       SAMPLING_WEIGHT: 0.4618
+       TRANSFORM: 'clip_transforms'
+     MODEL:
+       MAX_SEQ_LEN: 50
+       TEMP_NAME: logit_scale_caption
+     LOSSES:
+       NAMES: ['CrossEntropy', 'Accuracy']
+       LOSS_WEIGHT: 0.33333
+       REDUCTION: 'mean'
+     INFERENCE:
+       VOCAB: 'CLIP'
+       GENERATION_MODE: False
+
+ ENGINE:
+   NAME: 'UnifiedTrainer'
+
+ MODEL:
+   META_ARCHITECTURE: 'MultiTaskTransformerEncoder'
+   ENCODER: 'UnifiedBertEncoder'
+
+   IN_TUNING: True # use IN1k instead of 22k
+   SHARE_LAYERNORM: True
+   BERT:
+     NORMALIZE_DECISION: "BERTPre"
+     DROP_PATH_PROB: 0.0
+     DROP_PATH_PROB_FIXED: True
+     SCALE_MULTI_BEFORE: True
+
+   MODEL_EMA: False
+   MODEL_EMA_DECAY: 0.9999
+
+   MAEParamsInit: True
+   POSEMBEDFIX: True
+
+   IMG_INPUT_SIZE: 224
+   PATCH_SIZE: 16
+
+   LAYER_SCALE: True
+   LAYER_SCALE_INIT: 1e-3
+
+ DATALOADER:
+   USE_WEIGHTED_SAMPLER: True
+   UNIFIED_DATASET: True
+   NUM_WORKERS: 32
+
+   PADDING_TO_MAX: False # True for debugging or token moe with distributed moe
+
+ ####################################### Optimizer #######################################
+ SOLVER:
+   NAME: 'Adam'
+   TORCH_OPTIMIZER: True
+   PARAMS_SEPERATE: True
+   # PARAMS_GROUP: True
+   # EPOCH: 1
+   MAX_ITER: 150000
+   CHECKPOINT_PERIOD: 5000
+   EVAL_PERIOD: 500000
+   BASE_LR: 0.001
+   BIAS_LR_FACTOR: 1.0
+   WEIGHT_DECAY: 0.05
+   WEIGHT_DECAY_NORM: 0.0
+   WEIGHT_DECAY_BIAS: 0.0
+   WEIGHT_DECAY_EMBEDDING: 0.0
+   MOMENTUM: 0.9
+   DAMPENING: 0.0
+   NESTEROV: 0.0
+   BETAS: [0.9, 0.95]
+   EPS: 1e-6
+   GRAD_CLIP: 0.1
+   GRAD_CLIP_TYPE: 'norm'
+   ACCUM_ITER: 0
+   AMP_FP16: True
+   APEX_FP16: False # dangerous
+
+   WRITE_PERIOD: 50
+   MIN_LOSS_SCLE: 2048.0
+   # BF16: False # True
+   # ZEROSTAGE: 2
+
+   LOSS_SCALE_WINDOW: 200
+
+ ####################################### lr scheduler #######################################
+ LR_SCHEDULER:
+   NAME: 'WarmupCosine'
+   WARMUP: 5000
+   MIN_LR: 0.000001
+
+ ####################################### evaluation #######################################
+ INFERENCE:
+   VOCAB: 'CLIP'
+   ITER_BASED: True
+
+ find_unused_parameters: true
+
+ # ENCODERS:
+ #   -
+ #     NAME: VisualEncoder
+ #     TYPE: VisualEncoder
+ #     DROP_PATH_PROB: 0.0
+ #     HIDDEN_SIZE: 192
+ #     HIDDEN_DROPOUT_PROB: 0.
+ #     HIDDEN_ACT: "gelu"
+ #     NUM_ATTENTION_HEADS: 3
+ #     INTERMEDIATE_SIZE: 768
+ #     INTERMEDIATE_DROP: 0.
+ #     FFN_DROPOUT_PROB: 0.
+ #     ATTENTION_PROBS_DROPOUT_PROB: 0.
+ #     NUM_HIDDEN_LAYERS: 6
+ #     NUM_GENERATION_LAYERS: 0
+ #     DROP_PATH_PROB_FIXED: True
+
+ #   -
+ #     NAME: TextEncoder
+ #     TYPE: TextEncoder
+ #     DROP_PATH_PROB: 0.0
+ #     HIDDEN_SIZE: 192
+ #     HIDDEN_DROPOUT_PROB: 0.
+ #     HIDDEN_ACT: "gelu"
+ #     NUM_ATTENTION_HEADS: 3
+ #     INTERMEDIATE_SIZE: 768
+ #     INTERMEDIATE_DROP: 0.
+ #     FFN_DROPOUT_PROB: 0.
+ #     ATTENTION_PROBS_DROPOUT_PROB: 0.
+ #     NUM_HIDDEN_LAYERS: 6
+ #     NUM_GENERATION_LAYERS: 0
+ #     DROP_PATH_PROB_FIXED: True
+
+ MOE:
+   MOE: True
+   MOE_TYPE: 'attribute'
+   TAG_Transform: True
+   ATTRIBUTE_LENGTH: 8
+   EP_WORLD_SIZE: 1 # tag moe only
+   NUM_EXPERTS: 8
+   TOP_K: 2
+   CAPACITY_FACTOR: 3.0
+   EVAL_MIN_CAPACITY: 4.0
+   MIN_CAPACITY: 4
+   NOISY_GATE_POLICY: 'vmoe'
+   MOE_PARAM_GROUP: True
+   MOE_EXPERT_TYPE: 'FFN,SA'
+   SA_LINEAR_OUT_MOE: True
+   MOE_EXPERT_LOCATION: 'all' # 'odd'
+   # MOE_LAYER_START_IDX: 3
+   # MOE_LAYER_END_IDX: 21
+   # MOE_LAYER_START_IDX: 18
+   # MOE_LAYER_END_IDX: 12
+   BATCH_PRIO: True
+   USE_TUTEL: True
+   FFN_SHARE_GATE_DECISION: True
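The only model-side change in this variant is SCALE_MULTI_BEFORE: True in the BERT block, which moves the layer-scale multiplication relative to the residual branch. A plausible reading of the flag, inferred from its name alone (the repo's encoder defines the actual behavior):

    # Assumed meaning of SCALE_MULTI_BEFORE, based only on the flag name:
    # apply the learned per-channel gamma before vs. after the sublayer.
    def block_scale_after(x, sublayer, gamma):
        return x + gamma * sublayer(x)   # classic CaiT-style LayerScale

    def block_scale_before(x, sublayer, gamma):
        return x + sublayer(gamma * x)   # scaling moved in front of the sublayer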
configs/BERT_L12_H192_experiments/base_model_bert_l12_h192.yaml ADDED
@@ -0,0 +1,73 @@
+
+ ######################################### MODEL #########################################
+ MODEL:
+   VOCAB_SIZE: 49411 # include <BOS>/<EOS>
+   META_ARCHITECTURE: 'MultiTaskTransformerEncoder'
+   ENCODER: 'UnifiedBertEncoder_v3'
+   ENCODER_DIM: 192
+   DECODER: ''
+   DECODER_DIM: 192
+
+   PREDICTOR: 'EmbedClsAsRetrievalPredictor'
+   FEATURE_GATHER: True
+   LEARN_TEMP: True
+   PRED_USE_NORM: True
+   PRED_TEMPERATURE: 0.07
+
+   BertParamsInit: True
+
+   CLS_TOKEN: False
+
+   QUEUE_LEN: 1024
+   MAX_LABEL_LEN: 12
+
+   OUTPUT_PROJ: True # output projection
+
+   # #################################### Token embedding ####################################
+   TOKEN_EMBED:
+     NAME: 'TokenBaseEmbedding'
+     DIM: 192
+     ACTIVATION: 'none'
+     USE_NORM: True
+     DROPOUT: 0.0
+     POSITION: 'NNEmbeddingEncoding'
+     POSITION_MAX_LEN: 512
+     TYPE_VOCAB_SIZE: 2
+
+   # #################################### Visual embedding ####################################
+   VISUAL_EMBED:
+     NAME: 'none'
+
+   # #################################### video embedding ####################################
+   VIDEO_EMBED:
+     NAME: 'VideoBaseEmbedding'
+     IN_DIM: 768
+     OUT_DIM: 192
+     ACTIVATION: 'none'
+     USE_NORM: True
+     DROPOUT: 0.0
+     TYPE_SIZE: 1 # video to encoder
+     POSITION: 'NNEmbeddingEncoding'
+     MAX_LENGTH: 1600
+     PATCH_SIZE_S: 16
+     PATCH_SIZE_T: 1
+     DIVIDE_ST_POS: True
+     USE_VISUAL_TOKENIZER: True
+     USE_VISUAL_POS: True
+     MAX_FRAMES: 8
+
+   ####################################### BERT ############################################
+   BERT:
+     DROP_PATH_PROB: 0.0
+     HIDDEN_SIZE: 192
+     HIDDEN_DROPOUT_PROB: 0.
+     HIDDEN_ACT: "gelu"
+     NUM_ATTENTION_HEADS: 3
+     INTERMEDIATE_SIZE: 768
+     INTERMEDIATE_DROP: 0.
+     FFN_DROPOUT_PROB: 0.
+     ATTENTION_PROBS_DROPOUT_PROB: 0.
+     NUM_HIDDEN_LAYERS: 12
+     NUM_GENERATION_LAYERS: 0
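These BERT hyperparameters describe a BERT-tiny-like trunk: hidden size 192 split over 3 attention heads gives 64 dimensions per head, and the 768 intermediate size is the usual 4x FFN expansion. For shape intuition only, an equivalently sized vanilla PyTorch encoder (the repo's UnifiedBertEncoder_v3 is its own implementation, not this module):

    import torch.nn as nn

    # Shape-equivalent stack: 12 layers, hidden 192, 3 heads (64-dim each),
    # FFN 768, GELU - mirrors the BERT block above, not its exact code.
    layer = nn.TransformerEncoderLayer(
        d_model=192, nhead=3, dim_feedforward=768,
        dropout=0.0, activation='gelu', batch_first=True)
    encoder = nn.TransformerEncoder(layer, num_layers=12)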
configs/BERT_L12_H192_experiments/in1k_training.yaml ADDED
@@ -0,0 +1,197 @@
+ _BASE_: "base_model_bert_l12_h192.yaml"
+
+ SHARED_TARGETS:
+
+   -
+     NAME: 'ImageNet1k'
+     SHARED_TARGETS_CFG:
+       FILE_PATH: 'open_source_dataset/imagenet_class_name_CLIP_with_endoftext.pkl'
+       DISTRIBUTED: True
+
+   # -
+   #   NAME: 'Vocab_Word'
+   #   SHARED_TARGETS_CFG:
+   #     FILE_PATH: 'open_source_dataset/vocabulary_CLIP_with_endoftext.pkl'
+   #     DISTRIBUTED: True
+
+ TASKS:
+
+   -
+     NAME: imagenet
+     DATASETS:
+       TRAIN: 'ImageNetDataset'
+       VAL: 'ImageNetDataset'
+       TASK_TYPE: 'image_classification'
+       DATASET_NAME: 'ImageNet1k'
+       TARGET_SET: ['ImageNet1k']
+
+     DATALOADER:
+       TRAIN_BATCH_SIZE: 4
+       TEST_BATCH_SIZE: 4
+       NUM_WORKERS: 4
+       FEATS_FOLDER: 'open_source_dataset/imagenet'
+       S3_PATH: 'cluster2:s3://imagenet'
+       ANNO_FOLDER: 'open_source_dataset/imagenet/meta'
+       SAMPLING_WEIGHT: 1.0
+       CLASS_NAME_FILE: 'open_source_dataset/imagenet_class_name.pkl'
+       MIXUP: 0.8
+       CUTMIX: 1.0
+       MIXUP_PROB: 1.0
+       MIXUP_SWITCH_PROB: 0.5
+       MIXUP_MODE: 'batch'
+       MIXUP_LABEL_SMOOTHING: 0.1
+     MODEL:
+       MAX_SEQ_LEN: -1
+       LABELS_NUM: 1000
+       TEMP_NAME: logit_scale_img_cls
+     LOSSES:
+       NAMES: ['SoftTargetCrossEntropy', 'Accuracy']
+       LOSS_WEIGHT: 1.0
+       REDUCTION: 'mean'
+       # LOSS_FP32: True
+     INFERENCE:
+       NAME: 'ImageNetEvaler'
+       ID_KEY: 'image_id'
+       VALUE: 'cls_logits'
+       VAL_ANNFILE: 'open_source_dataset/imagenet/meta/val.txt'
+       TEST_ANNFILE: ''
+       GENERATION_MODE: False
+
+ ENGINE:
+   NAME: 'UnifiedTrainer'
+
+ MODEL:
+   META_ARCHITECTURE: 'MultiTaskTransformerEncoder'
+   ENCODER: 'UnifiedBertEncoder'
+
+   IN_TUNING: True # use IN1k instead of 22k
+   SHARE_LAYERNORM: True
+   BERT:
+     NORMALIZE_DECISION: "BERTPre"
+     DROP_PATH_PROB: 0.1
+     NUM_HIDDEN_LAYERS: 1
+     DROP_PATH_PROB_FIXED: True
+
+   UNIFY_QKV: True
+
+   OLD_CHECKPONT: True
+
+   MODEL_EMA: False
+   MODEL_EMA_DECAY: 0.9999
+
+   MAEParamsInit: True
+   POSEMBEDFIX: True
+
+   IMG_INPUT_SIZE: 224
+   PATCH_SIZE: 16
+   # POSEMBED_SCALE: !!python/object/apply:eval ["160/224"]
+   # CHECKPOINT_FILETER: False
+
+   LAYER_SCALE: True
+   LAYER_SCALE_INIT: 1e-3
+
+ DATALOADER:
+   USE_WEIGHTED_SAMPLER: True
+   UNIFIED_DATASET: True
+   NUM_WORKERS: 16
+
+   PADDING_TO_MAX: False # True for debugging or token moe with distributed moe
+
+ ####################################### Optimizer #######################################
+ SOLVER:
+   NAME: 'Adam'
+   TORCH_OPTIMIZER: True
+   PARAMS_SEPERATE: True
+   # PARAMS_GROUP: True
+   # EPOCH: 1
+   MAX_ITER: 150000
+   CHECKPOINT_PERIOD: 5000
+   EVAL_PERIOD: 500000
+   BASE_LR: 0.001
+   BIAS_LR_FACTOR: 1.0
+   WEIGHT_DECAY: 0.05
+   WEIGHT_DECAY_NORM: 0.0
+   WEIGHT_DECAY_BIAS: 0.0
+   WEIGHT_DECAY_EMBEDDING: 0.0
+   MOMENTUM: 0.9
+   DAMPENING: 0.0
+   NESTEROV: 0.0
+   BETAS: [0.9, 0.95]
+   EPS: 1e-6
+   GRAD_CLIP: 0.1
+   GRAD_CLIP_TYPE: 'norm'
+   ACCUM_ITER: 0
+   AMP_FP16: True
+   APEX_FP16: False # dangerous
+
+   WRITE_PERIOD: 50
+   MIN_LOSS_SCLE: 2048.0
+   # BF16: False # True
+   # ZEROSTAGE: 2
+
+   LOSS_SCALE_WINDOW: 200
+
+ ####################################### lr scheduler #######################################
+ LR_SCHEDULER:
+   NAME: 'WarmupCosine'
+   WARMUP: 5000
+   MIN_LR: 0.000001
+
+ ####################################### evaluation #######################################
+ INFERENCE:
+   VOCAB: 'CLIP'
+   ITER_BASED: True
+
+ find_unused_parameters: true
+
+ # ENCODERS:
+ #   -
+ #     NAME: VisualEncoder
+ #     TYPE: VisualEncoder
+ #     DROP_PATH_PROB: 0.0
+ #     HIDDEN_SIZE: 192
+ #     HIDDEN_DROPOUT_PROB: 0.
+ #     HIDDEN_ACT: "gelu"
+ #     NUM_ATTENTION_HEADS: 3
+ #     INTERMEDIATE_SIZE: 768
+ #     INTERMEDIATE_DROP: 0.
+ #     FFN_DROPOUT_PROB: 0.
+ #     ATTENTION_PROBS_DROPOUT_PROB: 0.
+ #     NUM_HIDDEN_LAYERS: 6
+ #     NUM_GENERATION_LAYERS: 0
+ #     DROP_PATH_PROB_FIXED: True
+
+ #   -
+ #     NAME: TextEncoder
+ #     TYPE: TextEncoder
+ #     DROP_PATH_PROB: 0.0
+ #     HIDDEN_SIZE: 192
+ #     HIDDEN_DROPOUT_PROB: 0.
+ #     HIDDEN_ACT: "gelu"
+ #     NUM_ATTENTION_HEADS: 3
+ #     INTERMEDIATE_SIZE: 768
+ #     INTERMEDIATE_DROP: 0.
+ #     FFN_DROPOUT_PROB: 0.
+ #     ATTENTION_PROBS_DROPOUT_PROB: 0.
+ #     NUM_HIDDEN_LAYERS: 6
+ #     NUM_GENERATION_LAYERS: 0
+ #     DROP_PATH_PROB_FIXED: True
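The MIXUP/CUTMIX fields in the imagenet task map one-to-one onto timm's Mixup helper, which pairs naturally with the SoftTargetCrossEntropy loss named above (the timm API is real; that this repo uses it under the hood is our assumption):

    import torch
    from timm.data import Mixup

    # Same values as the DATALOADER block above.
    mixup_fn = Mixup(mixup_alpha=0.8, cutmix_alpha=1.0, prob=1.0,
                     switch_prob=0.5, mode='batch',
                     label_smoothing=0.1, num_classes=1000)

    images = torch.randn(8, 3, 224, 224)
    targets = torch.randint(0, 1000, (8,))
    images, soft_targets = mixup_fn(images, targets)  # soft targets feed SoftTargetCrossEntropy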
configs/BERT_L12_H192_experiments/in1k_training_moe.yaml ADDED
@@ -0,0 +1,219 @@
+ _BASE_: "base_model_bert_l12_h192.yaml"
+
+ SHARED_TARGETS:
+
+   -
+     NAME: 'ImageNet1k'
+     SHARED_TARGETS_CFG:
+       FILE_PATH: 'open_source_dataset/imagenet_class_name_CLIP_with_endoftext.pkl'
+       DISTRIBUTED: True
+
+   # -
+   #   NAME: 'Vocab_Word'
+   #   SHARED_TARGETS_CFG:
+   #     FILE_PATH: 'open_source_dataset/vocabulary_CLIP_with_endoftext.pkl'
+   #     DISTRIBUTED: True
+
+ TASKS:
+
+   -
+     NAME: imagenet
+     DATASETS:
+       TRAIN: 'ImageNetDataset'
+       VAL: 'ImageNetDataset'
+       TASK_TYPE: 'image_classification'
+       DATASET_NAME: 'ImageNet1k'
+       TARGET_SET: ['ImageNet1k']
+
+     DATALOADER:
+       TRAIN_BATCH_SIZE: 4
+       TEST_BATCH_SIZE: 4
+       NUM_WORKERS: 4
+       FEATS_FOLDER: 'open_source_dataset/imagenet'
+       S3_PATH: 'cluster2:s3://imagenet'
+       ANNO_FOLDER: 'open_source_dataset/imagenet/meta'
+       SAMPLING_WEIGHT: 1.0
+       CLASS_NAME_FILE: 'open_source_dataset/imagenet_class_name.pkl'
+       MIXUP: 0.8
+       CUTMIX: 1.0
+       MIXUP_PROB: 1.0
+       MIXUP_SWITCH_PROB: 0.5
+       MIXUP_MODE: 'batch'
+       MIXUP_LABEL_SMOOTHING: 0.1
+     MODEL:
+       MAX_SEQ_LEN: -1
+       LABELS_NUM: 1000
+       TEMP_NAME: logit_scale_img_cls
+     LOSSES:
+       NAMES: ['SoftTargetCrossEntropy', 'Accuracy']
+       LOSS_WEIGHT: 1.0
+       REDUCTION: 'mean'
+       # LOSS_FP32: True
+     INFERENCE:
+       NAME: 'ImageNetEvaler'
+       ID_KEY: 'image_id'
+       VALUE: 'cls_logits'
+       VAL_ANNFILE: 'open_source_dataset/imagenet/meta/val.txt'
+       TEST_ANNFILE: ''
+       GENERATION_MODE: False
+
+ ENGINE:
+   NAME: 'UnifiedTrainer'
+
+ MODEL:
+   META_ARCHITECTURE: 'MultiTaskTransformerEncoder'
+   ENCODER: 'UnifiedBertEncoder'
+
+   IN_TUNING: True # use IN1k instead of 22k
+   SHARE_LAYERNORM: True
+   BERT:
+     NORMALIZE_DECISION: "BERTPre"
+     DROP_PATH_PROB: 0.0
+     DROP_PATH_PROB_FIXED: True
+
+   UNIFY_QKV: True
+
+   OLD_CHECKPONT: True
+
+   MODEL_EMA: False
+   MODEL_EMA_DECAY: 0.9999
+
+   MAEParamsInit: True
+   POSEMBEDFIX: True
+
+   IMG_INPUT_SIZE: 224
+   PATCH_SIZE: 16
+   # POSEMBED_SCALE: !!python/object/apply:eval ["160/224"]
+   # CHECKPOINT_FILETER: False
+
+   LAYER_SCALE: True
+   LAYER_SCALE_INIT: 1e-3
+
+ DATALOADER:
+   USE_WEIGHTED_SAMPLER: True
+   UNIFIED_DATASET: True
+   NUM_WORKERS: 16
+
+   PADDING_TO_MAX: False # True for debugging or token moe with distributed moe
+
+ ####################################### Optimizer #######################################
+ SOLVER:
+   NAME: 'Adam'
+   TORCH_OPTIMIZER: True
+   PARAMS_SEPERATE: True
+   # PARAMS_GROUP: True
+   # EPOCH: 1
+   MAX_ITER: 150000
+   CHECKPOINT_PERIOD: 5000
+   EVAL_PERIOD: 500000
+   BASE_LR: 0.001
+   BIAS_LR_FACTOR: 1.0
+   WEIGHT_DECAY: 0.05
+   WEIGHT_DECAY_NORM: 0.0
+   WEIGHT_DECAY_BIAS: 0.0
+   WEIGHT_DECAY_EMBEDDING: 0.0
+   MOMENTUM: 0.9
+   DAMPENING: 0.0
+   NESTEROV: 0.0
+   BETAS: [0.9, 0.95]
+   EPS: 1e-6
+   GRAD_CLIP: 0.1
+   GRAD_CLIP_TYPE: 'norm'
+   ACCUM_ITER: 0
+   AMP_FP16: True
+   APEX_FP16: False # dangerous
+
+   WRITE_PERIOD: 50
+   MIN_LOSS_SCLE: 2048.0
+   # BF16: False # True
+   # ZEROSTAGE: 2
+
+   LOSS_SCALE_WINDOW: 200
+
+ ####################################### lr scheduler #######################################
+ LR_SCHEDULER:
+   NAME: 'WarmupCosine'
+   WARMUP: 5000
+   MIN_LR: 0.000001
+
+ ####################################### evaluation #######################################
+ INFERENCE:
+   VOCAB: 'CLIP'
+   ITER_BASED: True
+
+ find_unused_parameters: true
+
+ # ENCODERS:
+ #   -
+ #     NAME: VisualEncoder
+ #     TYPE: VisualEncoder
+ #     DROP_PATH_PROB: 0.0
+ #     HIDDEN_SIZE: 192
+ #     HIDDEN_DROPOUT_PROB: 0.
+ #     HIDDEN_ACT: "gelu"
+ #     NUM_ATTENTION_HEADS: 3
+ #     INTERMEDIATE_SIZE: 768
+ #     INTERMEDIATE_DROP: 0.
+ #     FFN_DROPOUT_PROB: 0.
+ #     ATTENTION_PROBS_DROPOUT_PROB: 0.
+ #     NUM_HIDDEN_LAYERS: 6
+ #     NUM_GENERATION_LAYERS: 0
+ #     DROP_PATH_PROB_FIXED: True
+
+ #   -
+ #     NAME: TextEncoder
+ #     TYPE: TextEncoder
+ #     DROP_PATH_PROB: 0.0
+ #     HIDDEN_SIZE: 192
+ #     HIDDEN_DROPOUT_PROB: 0.
+ #     HIDDEN_ACT: "gelu"
+ #     NUM_ATTENTION_HEADS: 3
+ #     INTERMEDIATE_SIZE: 768
+ #     INTERMEDIATE_DROP: 0.
+ #     FFN_DROPOUT_PROB: 0.
+ #     ATTENTION_PROBS_DROPOUT_PROB: 0.
+ #     NUM_HIDDEN_LAYERS: 6
+ #     NUM_GENERATION_LAYERS: 0
+ #     DROP_PATH_PROB_FIXED: True
+
+ MOE:
+   MOE: True
+   MOE_TYPE: 'attribute'
+   TAG_Transform: True
+   ATTRIBUTE_LENGTH: 8
+   EP_WORLD_SIZE: 1 # tag moe only
+   NUM_EXPERTS: 8
+   TOP_K: 2
+   CAPACITY_FACTOR: 3.0
+   EVAL_MIN_CAPACITY: 4.0
+   MIN_CAPACITY: 4
+   NOISY_GATE_POLICY: 'vmoe'
+   MOE_PARAM_GROUP: True
+   MOE_EXPERT_TYPE: 'FFN,SA'
+   SA_LINEAR_OUT_MOE: True
+   MOE_EXPERT_LOCATION: 'all' # 'odd'
+   # MOE_LAYER_START_IDX: 3
+   # MOE_LAYER_END_IDX: 21
+   # MOE_LAYER_START_IDX: 18
+   # MOE_LAYER_END_IDX: 12
+   BATCH_PRIO: True
+   USE_TUTEL: True
+   FFN_SHARE_GATE_DECISION: True
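Across these files, WARMUP: 5000 with MAX_ITER: 150000 means a linear ramp to BASE_LR over the first 5k steps followed by a cosine decay toward MIN_LR. The standard schedule those numbers imply, as a sketch (the repo's WarmupCosine may differ in small details):

    import math

    def warmup_cosine(step, base_lr=1e-3, min_lr=1e-6, warmup=5000, max_iter=150000):
        if step < warmup:
            return base_lr * step / warmup          # linear warmup
        t = (step - warmup) / (max_iter - warmup)   # progress in [0, 1]
        return min_lr + 0.5 * (base_lr - min_lr) * (1 + math.cos(math.pi * t))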
configs/BERT_L12_H192_experiments/moe_debug.yaml ADDED
@@ -0,0 +1,536 @@
+ _BASE_: "base_model_bert_l12_h192.yaml"
+
+ SHARED_TARGETS:
+
+   # -
+   #   NAME: 'ImageNet1k'
+   #   SHARED_TARGETS_CFG:
+   #     FILE_PATH: 'open_source_dataset/imagenet_class_name_CLIP_with_endoftext.pkl'
+   #     DISTRIBUTED: False
+
+   -
+     NAME: 'Vocab_Word'
+     SHARED_TARGETS_CFG:
+       FILE_PATH: 'open_source_dataset/vocabulary_CLIP_with_endoftext.pkl'
+       DISTRIBUTED: True
+
+   # -
+   #   NAME: 'Kinetics400'
+   #   SHARED_TARGETS_CFG:
+   #     FILE_PATH: 'open_source_dataset/k400_class_name_CLIP_with_endoftext.pkl'
+   #     DISTRIBUTED: False
+
+ TASKS:
+
+   # -
+   #   NAME: imagenet
+   #   DATASETS:
+   #     TRAIN: 'ImageNetDataset'
+   #     VAL: 'ImageNetDataset'
+   #     TASK_TYPE: 'image_classification'
+   #     DATASET_NAME: 'ImageNet1k'
+   #     TARGET_SET: ['ImageNet1k']
+
+   #   DATALOADER:
+   #     TRAIN_BATCH_SIZE: 720
+   #     # TEST_BATCH_SIZE: 2
+   #     NUM_WORKERS: 4
+   #     FEATS_FOLDER: 'cluster2:s3://imagenet'
+   #     ANNO_FOLDER: 'open_source_dataset/imagenet/meta'
+   #     SAMPLING_WEIGHT: 2.5
+   #     CLASS_NAME_FILE: 'open_source_dataset/imagenet_class_name.pkl'
+   #     MIXUP: 0.8
+   #     CUTMIX: 1.0
+   #     MIXUP_PROB: 1.0
+   #     MIXUP_SWITCH_PROB: 0.5
+   #     MIXUP_MODE: 'batch'
+   #     MIXUP_LABEL_SMOOTHING: 0.1
+   #   MODEL:
+   #     MAX_SEQ_LEN: -1
+   #     LABELS_NUM: 1000
+   #     TEMP_NAME: logit_scale_img_cls
+   #   LOSSES:
+   #     NAMES: ['SoftTargetCrossEntropy', 'Accuracy']
+   #     LOSS_WEIGHT: 1.0
+   #     REDUCTION: 'mean'
+   #     # LOSS_FP32: True
+   #   INFERENCE:
+   #     NAME: 'ImageNetEvaler'
+   #     ID_KEY: 'image_id'
+   #     VALUE: 'cls_logits'
+   #     VAL_ANNFILE: 'open_source_dataset/imagenet/meta/val.txt'
+   #     TEST_ANNFILE: ''
+   #     GENERATION_MODE: False
+
+   # -
+   #   NAME: K400_retrieve
+   #   DATASETS:
+   #     TRAIN: 'VideoDataSet'
+   #     VAL: 'VideoDataSet'
+   #     TASK_TYPE: 'video_classification'
+   #     DATASET_NAME: 'K400'
+   #     TARGET_SET: ['Kinetics400']
+   #   DATALOADER:
+   #     TRAIN_BATCH_SIZE: 12 # 256
+   #     TEST_BATCH_SIZE: 4 # debug
+   #     NUM_WORKERS: 4 # debug 4
+   #     FEATS_FOLDER: 'open_source_dataset/K400_official'
+   #     ANNO_FOLDER: 'open_source_dataset/K400_official'
+   #     S3_PATH: 's3://K400/'
+   #     FRAMES_PER_CLIP: 8
+   #     STRIDE: 32
+   #     FILE_EXTENSION: ''
+   #     ANNO_FILE: 'annotation.json'
+   #     TIMESFORMER_AUG: True
+   #     SAMPLING_WEIGHT: 1.0
+   #   MODEL:
+   #     MAX_SEQ_LEN: -1
+   #     TEMP_NAME: logit_scale_video_cls
+   #   LOSSES:
+   #     NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
+   #     LABELSMOOTHING: 0.1
+   #     LOSS_WEIGHT: 1.0
+   #   INFERENCE:
+   #     NAME: 'MiTEvaler'
+   #     ID_KEY: 'video_name'
+   #     VALUE: 'label'
+   #     VAL_ANNFILE: 'open_source_dataset/K400_official/annotation.json'
+   #     TEST_ANNFILE: ''
+   #     GENERATION_MODE: False
+   #     NUM_VIEWS: 1
+
+   # -
+   #   NAME: bookswiki_pretrain
+   #   DATASETS:
+   #     TRAIN: 'GeneralCorpusDataset'
+   #     TASK_TYPE: 'text_mlm'
+   #     DATASET_NAME: 'BooksWiki'
+   #     TARGET_SET: ['Vocab_Word']
+   #     VERSION: 'v2'
+   #   DATALOADER:
+   #     TRAIN_BATCH_SIZE: 512
+   #     TEST_BATCH_SIZE: 32
+   #     NUM_WORKERS: 2
+   #     ANNO_FOLDER: 'open_source_dataset/text_corpus' # 'open_source_dataset/bert_pretrain_data/bookswiki'
+   #     # ANNO_FOLDER: 'open_source_dataset/bert_pretrain_data/bookswiki'
+   #     SEQ_PER_SAMPLE: 1
+   #     SAMPLER: NodeDistributed
+   #     CACHE_MODE: True
+   #     SEQ_PER_SAMPLE: 128
+   #     MIN_SEQ_PER_SAMPLE: 128
+   #     APPEND_EOS: True
+   #     ONE_STREAM: False
+   #     SAMPLING_WEIGHT: 3.5
+   #     RANDOM_MASK: True
+   #   MODEL:
+   #     MAX_SEQ_LEN: 128
+   #     TEMP_NAME: logit_scale_text_mlm
+   #   LOSSES:
+   #     NAMES: ['CrossEntropy', 'Accuracy']
+   #     LOSS_WEIGHT: 0.33333
+   #     REDUCTION: 'mean'
+   #   INFERENCE:
+   #     VOCAB: 'CLIP'
+   #     GENERATION_MODE: False
+   # -
+   #   NAME: mscoco_retrieve
+   #   DATASETS:
+   #     TRAIN: 'ImageTextPairDataset'
+   #     TEST: 'ImageTextPairDataset'
+   #     TASK_TYPE: 'image_retrieval'
+ # DATASET_NAME: 'MSCOCO'
144
+ # DATALOADER:
145
+ # TRAIN_BATCH_SIZE: 100
146
+ # TEST_BATCH_SIZE: 32
147
+ # NUM_WORKERS: 1
148
+ # FEATS_FOLDER: 'open_source_dataset/mscoco_dataset/coco_origin'
149
+ # ANNO_FOLDER: 'open_source_dataset/mscoco_dataset/new_annotations'
150
+ # S3_PATH: 's3://coco/'
151
+ # SEQ_PER_SAMPLE: 1
152
+ # CACHE_MODE: True
153
+ # CIRCULAR_CACHE_MODE: False
154
+ # ZIP_MODE: False
155
+ # CACHE_ORIGIN_IMAGE: False
156
+ # RANDOM_CAPTION: False
157
+ # AS_NUMPY_AS_POSSIBLE: False
158
+ # SAMPLING_WEIGHT: 1.0
159
+ # TRANSFORM: 'clip_transforms'
160
+ # MODEL:
161
+ # MAX_SEQ_LEN: 50
162
+ # TEMP_NAME: logit_scale_retrieve
163
+ # LOSSES:
164
+ # NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
165
+ # LABELSMOOTHING: 0.1
166
+ # LOSS_WEIGHT: 1.0
167
+ # REDUCTION: 'mean'
168
+ # INFERENCE:
169
+ # VOCAB: 'CLIP'
170
+ # ID_KEY: 'image_id'
171
+ # VALUE: 'caption'
172
+ # NAME: 'RetrievalEvaler'
173
+ # VAL_ANNFILE: 'open_source_dataset/flickr30k/all_data_final_val_set0_2014.jsonline'
174
+ # TEST_ANNFILE: 'open_source_dataset/flickr30k/all_data_final_test_set0_2014.jsonline'
175
+ # GENERATION_MODE: False
176
+
177
+ ########## Image Captioning ###########
178
+
179
+
180
+ # -
181
+ # NAME: cc12m_caption
182
+ # DATASETS:
183
+ # TRAIN: 'ImageTextPairDataset'
184
+ # TASK_TYPE: 'image_caption'
185
+ # DATASET_NAME: 'CC12M'
186
+ # TARGET_SET: ['Vocab_Word']
187
+ # DATALOADER:
188
+ # TRAIN_BATCH_SIZE: 300
189
+ # TEST_BATCH_SIZE: 32
190
+ # NUM_WORKERS: 2
191
+ # S3_ANNO_FOLDER: 's3://cc12m/'
192
+ # ANNO_FOLDER: 'open_source_dataset/c12m/'
193
+ # ANNO_FILENAME: 'train_available.json'
194
+ # FEATS_FOLDER: 'open_source_dataset/c12m/'
195
+ # S3_PATH: 's3://cc12m/'
196
+ # SEQ_PER_SAMPLE: 1
197
+ # SAMPLER: NodeDistributed
198
+ # CACHE_MODE: True
199
+ # CIRCULAR_CACHE_MODE: False
200
+ # ZIP_MODE: False
201
+ # CACHE_ORIGIN_IMAGE: False
202
+ # RANDOM_CAPTION: False
203
+ # AS_NUMPY_AS_POSSIBLE: False
204
+ # SAMPLING_WEIGHT: 1.6889
205
+ # TRANSFORM: 'clip_transforms'
206
+ # MODEL:
207
+ # MAX_SEQ_LEN: 50
208
+ # TEMP_NAME: logit_scale_caption
209
+ # LOSSES:
210
+ # NAMES: ['CrossEntropy', 'Accuracy']
211
+ # LOSS_WEIGHT: 0.33333
212
+ # REDUCTION: 'mean'
213
+ # INFERENCE:
214
+ # VOCAB: 'CLIP'
215
+ # GENERATION_MODE: False
216
+
217
+ # -
218
+ # NAME: cc3m_caption
219
+ # DATASETS:
220
+ # TRAIN: 'ImageTextPairDataset'
221
+ # TASK_TYPE: 'image_caption'
222
+ # DATASET_NAME: 'CC3M'
223
+ # TARGET_SET: ['Vocab_Word']
224
+ # DATALOADER:
225
+ # TRAIN_BATCH_SIZE: 300
226
+ # TEST_BATCH_SIZE: 32
227
+ # NUM_WORKERS: 2
228
+ # ANNO_FOLDER: 's3://cc3m/'
229
+ # ANNO_FILENAME: 'train_spacy.json'
230
+ # FEATS_FOLDER: 'open_source_dataset/cc3m/'
231
+ # S3_PATH: 's3://cc3m/'
232
+ # SEQ_PER_SAMPLE: 1
233
+ # SAMPLER: NodeDistributed
234
+ # CACHE_MODE: True
235
+ # CIRCULAR_CACHE_MODE: False
236
+ # ZIP_MODE: False
237
+ # CACHE_ORIGIN_IMAGE: False
238
+ # RANDOM_CAPTION: False
239
+ # AS_NUMPY_AS_POSSIBLE: False
240
+ # SAMPLING_WEIGHT: 0.8780
241
+ # TRANSFORM: 'clip_transforms'
242
+ # MODEL:
243
+ # MAX_SEQ_LEN: 50
244
+ # TEMP_NAME: logit_scale_caption
245
+ # LOSSES:
246
+ # NAMES: ['CrossEntropy', 'Accuracy']
247
+ # LOSS_WEIGHT: 0.33333
248
+ # REDUCTION: 'mean'
249
+ # INFERENCE:
250
+ # VOCAB: 'CLIP'
251
+ # GENERATION_MODE: False
252
+
253
+ # -
254
+ # NAME: vg_caption
255
+ # DATASETS:
256
+ # TRAIN: 'ImageTextPairDataset'
257
+ # TASK_TYPE: 'image_caption'
258
+ # DATASET_NAME: 'VG'
259
+ # TARGET_SET: ['Vocab_Word']
260
+ # DATALOADER:
261
+ # TRAIN_BATCH_SIZE: 300
262
+ # TEST_BATCH_SIZE: 32
263
+ # NUM_WORKERS: 2
264
+ # FEATS_FOLDER: 'open_source_dataset/visual_genome/images'
265
+ # ANNO_FOLDER: 'open_source_dataset/visual_genome/annotations'
266
+ # S3_PATH: 's3://visual_genome/images'
267
+ # ANNO_FILENAME: 'vg_captions_128filter.json'
268
+ # SEQ_PER_SAMPLE: 1
269
+ # CACHE_MODE: True
270
+ # CIRCULAR_CACHE_MODE: False
271
+ # ZIP_MODE: False
272
+ # CACHE_ORIGIN_IMAGE: False
273
+ # RANDOM_CAPTION: False
274
+ # AS_NUMPY_AS_POSSIBLE: False
275
+ # SAMPLING_WEIGHT: 0.5895
276
+ # TRANSFORM: 'clip_transforms'
277
+ # MODEL:
278
+ # MAX_SEQ_LEN: 30
279
+ # TEMP_NAME: logit_scale_caption
280
+ # LOSSES:
281
+ # NAMES: ['CrossEntropy', 'Accuracy']
282
+ # LOSS_WEIGHT: 0.33333
283
+ # REDUCTION: 'mean'
284
+ # INFERENCE:
285
+ # VOCAB: 'CLIP'
286
+ # GENERATION_MODE: True
287
+
288
+ -
289
+ NAME: mscoco_caption
290
+ DATASETS:
291
+ TRAIN: 'ImageTextPairDataset'
292
+ # VAL: 'ImageTextPairDataset'
293
+ TEST: 'ImageTextPairDataset'
294
+ TASK_TYPE: 'image_caption'
295
+ DATASET_NAME: 'MSCOCO'
296
+ TARGET_SET: ['Vocab_Word']
297
+ DATALOADER:
298
+ TRAIN_BATCH_SIZE: 32
299
+ TEST_BATCH_SIZE: 2
300
+ NUM_WORKERS: 4
301
+ FEATS_FOLDER: 'open_source_dataset/mscoco_dataset/coco_origin'
302
+ ANNO_FOLDER: 'open_source_dataset/mscoco_dataset/new_annotations'
303
+ S3_PATH: 's3://coco/'
304
+ SEQ_PER_SAMPLE: 1
305
+ CACHE_MODE: True
306
+ CIRCULAR_CACHE_MODE: False
307
+ ZIP_MODE: False
308
+ CACHE_ORIGIN_IMAGE: False
309
+ RANDOM_CAPTION: False
310
+ AS_NUMPY_AS_POSSIBLE: False
311
+ SAMPLING_WEIGHT: 0.3817
312
+ TRANSFORM: 'clip_transforms'
313
+ RANDOM_MASK: True
314
+ MODEL:
315
+ MAX_SEQ_LEN: 50
316
+ EVAL_MAX_SEQ_LEN: 21
317
+ TEMP_NAME: logit_scale_caption
318
+ LOSSES:
319
+ NAMES: ['CrossEntropy', 'Accuracy']
320
+ LOSS_WEIGHT: 0.33333
321
+ REDUCTION: 'mean'
322
+ DECODE_STRATEGY:
323
+ NAME: 'CaptionBeamSearcherV3'
324
+ BEAM_SIZE: 2
325
+ # LEN_PENALTY: 2.0
326
+ INFERENCE:
327
+ NAME: 'COCOEvaler'
328
+ VOCAB: 'CLIP'
329
+ ID_KEY: 'image_id'
330
+ VALUE: 'caption'
331
+ VAL_ANNFILE: 'open_source_dataset/mscoco_dataset/new_annotations/captions_val5k.json'
332
+ TEST_ANNFILE: 'open_source_dataset/mscoco_dataset/new_annotations/captions_test5k.json'
333
+ GENERATION_MODE: True
334
+
335
+ # -
336
+ # NAME: sbu_caption
337
+ # DATASETS:
338
+ # TRAIN: 'ImageTextPairDataset'
339
+ # TASK_TYPE: 'image_caption'
340
+ # DATASET_NAME: 'SBU'
341
+ # TARGET_SET: ['Vocab_Word']
342
+ # DATALOADER:
343
+ # TRAIN_BATCH_SIZE: 300
344
+ # TEST_BATCH_SIZE: 32
345
+ # NUM_WORKERS: 1
346
+ # S3_ANNO_FOLDER: 's3://SBU/annotations'
347
+ # ANNO_FOLDER: 'open_source_dataset/sbucaption/annotations'
348
+ # ANNO_FILENAME: 'subcaption.json'
349
+ # FEATS_FOLDER: 'open_source_dataset/sbucaption/'
350
+ # S3_PATH: 's3://SBU/images'
351
+ # SEQ_PER_SAMPLE: 1
352
+ # SAMPLER: NodeDistributed
353
+ # CACHE_MODE: True
354
+ # CIRCULAR_CACHE_MODE: False
355
+ # ZIP_MODE: False
356
+ # CACHE_ORIGIN_IMAGE: False
357
+ # RANDOM_CAPTION: False
358
+ # AS_NUMPY_AS_POSSIBLE: False
359
+ # SAMPLING_WEIGHT: 0.4618
360
+ # TRANSFORM: 'clip_transforms'
361
+ # MODEL:
362
+ # MAX_SEQ_LEN: 50
363
+ # TEMP_NAME: logit_scale_caption
364
+ # LOSSES:
365
+ # NAMES: ['CrossEntropy', 'Accuracy']
366
+ # LOSS_WEIGHT: 0.33333
367
+ # REDUCTION: 'mean'
368
+ # INFERENCE:
369
+ # VOCAB: 'CLIP'
370
+ # GENERATION_MODE: False
371
+
372
+
373
+ ENGINE:
374
+ NAME: 'UnifiedTrainer'
375
+
376
+ MODEL:
377
+ META_ARCHITECTURE: 'MultiTaskTransformerEncoder'
378
+ ENCODER: 'UnifiedBertEncoder'
379
+
380
+ IN_TUNING: True # use IN1k instead of 22k
381
+ SHARE_LAYERNORM: True
382
+ BERT:
383
+ NORMALIZE_DECISION: "BERTPre"
384
+ DROP_PATH_PROB: 0.0
385
+ DROP_PATH_PROB_FIXED: True
386
+
387
+ MODEL_EMA: False
388
+ MODEL_EMA_DECAY: 0.9999
389
+
390
+ MAEParamsInit: True
391
+ POSEMBEDFIX: True
392
+
393
+
394
+ IMG_INPUT_SIZE: 224
395
+ PATCH_SIZE: 16
396
+
397
+ LAYER_SCALE: True
398
+ LAYER_SCALE_INIT: 1e-3
399
+
400
+
401
+ LAYER_SCALE_FP32: True
402
+ GATE_FP32: False
403
+ TAG_TRANSFORM_FP32: False
404
+
405
+
406
+ DATALOADER:
407
+ USE_WEIGHTED_SAMPLER: True
408
+ UNIFIED_DATASET: True
409
+ NUM_WORKERS: 32
410
+ STRATEGY: 'turn'
411
+
412
+ PADDING_TO_MAX: False # set True for debugging, or for token MoE with distributed MoE
413
+
414
+
415
+
416
+ ####################################### Optimizer #######################################
417
+ SOLVER:
418
+ NAME: 'Adam'
419
+ TORCH_OPTIMIZER: True
420
+ PARAMS_SEPERATE: True
421
+ # PARAMS_GROUP: True
422
+ # EPOCH: 1
423
+ MAX_ITER: 150000
424
+ CHECKPOINT_PERIOD: 5000
425
+ EVAL_PERIOD: 500000
426
+ BASE_LR: 0.001
427
+ BIAS_LR_FACTOR: 1.0
428
+ WEIGHT_DECAY: 0.05
429
+ WEIGHT_DECAY_NORM: 0.0
430
+ WEIGHT_DECAY_BIAS: 0.0
431
+ WEIGHT_DECAY_EMBEDDING: 0.0
432
+ MOMENTUM: 0.9
433
+ DAMPENING: 0.0
434
+ NESTEROV: 0.0
435
+ BETAS: [0.9, 0.95]
436
+ EPS: 1e-6
437
+ GRAD_CLIP: 0.1
438
+ GRAD_CLIP_TYPE: 'norm'
439
+ ACCUM_ITER: 0
440
+ AMP_FP16: True
441
+ APEX_FP16: False # dangerous
442
+
443
+ WRITE_PERIOD: 50
444
+ MIN_LOSS_SCLE: 2048.0
445
+ # BF16: False # True
446
+ # ZEROSTAGE: 2
447
+
448
+ LOSS_SCALE_WINDOW: 200
449
+
450
+
451
+
452
+ FORCE_SOFTMAX_FP16: True
453
+ FORCE_LN_FP16: True
454
+ FORCE_NORM_FP16: True
455
+ # FORCE_TEMP_FP16: True
456
+ FORCE_EMBED_FP16: True
457
+
458
+
459
+
460
+
461
+
462
+
463
+ ####################################### lr scheduler #######################################
464
+ LR_SCHEDULER:
465
+ NAME: 'WarmupCosine'
466
+ WARMUP: 5000
467
+ MIN_LR: 0.000001
468
+
469
+
470
+
471
+
472
+ ####################################### evaluation #######################################
473
+ INFERENCE:
474
+
475
+ VOCAB: 'CLIP'
476
+ ITER_BASED: True
477
+
478
+
479
+ find_unused_parameters: true
480
+
481
+ # ENCODERS:
482
+ # -
483
+ # NAME: VisualEncoder
484
+ # TYPE: VisualEncoder
485
+ # DROP_PATH_PROB: 0.0
486
+ # HIDDEN_SIZE: 192
487
+ # HIDDEN_DROPOUT_PROB: 0.
488
+ # HIDDEN_ACT: "gelu"
489
+ # NUM_ATTENTION_HEADS: 3
490
+ # INTERMEDIATE_SIZE: 768
491
+ # INTERMEDIATE_DROP: 0.
492
+ # FFN_DROPOUT_PROB: 0.
493
+ # ATTENTION_PROBS_DROPOUT_PROB: 0.
494
+ # NUM_HIDDEN_LAYERS: 6
495
+ # NUM_GENERATION_LAYERS: 0
496
+ # DROP_PATH_PROB_FIXED: True
497
+
498
+ # -
499
+ # NAME: TextEncoder
500
+ # TYPE: TextEncoder
501
+ # DROP_PATH_PROB: 0.0
502
+ # HIDDEN_SIZE: 192
503
+ # HIDDEN_DROPOUT_PROB: 0.
504
+ # HIDDEN_ACT: "gelu"
505
+ # NUM_ATTENTION_HEADS: 3
506
+ # INTERMEDIATE_SIZE: 768
507
+ # INTERMEDIATE_DROP: 0.
508
+ # FFN_DROPOUT_PROB: 0.
509
+ # ATTENTION_PROBS_DROPOUT_PROB: 0.
510
+ # NUM_HIDDEN_LAYERS: 6
511
+ # NUM_GENERATION_LAYERS: 0
512
+ # DROP_PATH_PROB_FIXED: True
513
+
514
+ MOE:
515
+ MOE: True
516
+ MOE_TYPE: 'attribute'
517
+ TAG_Transform: True
518
+ ATTRIBUTE_LENGTH: 8
519
+ EP_WORLD_SIZE: 1 # expert-parallel world size; used for tag MoE only
520
+ NUM_EXPERTS: 8
521
+ TOP_K: 2
522
+ CAPACITY_FACTOR: 3.0
523
+ EVAL_MIN_CAPACITY: 4.0
524
+ MIN_CAPACITY: 4
525
+ NOISY_GATE_POLICY: 'vmoe'
526
+ MOE_PARAM_GROUP: True
527
+ MOE_EXPERT_TYPE: 'FFN,SA'
528
+ SA_LINEAR_OUT_MOE: True
529
+ MOE_EXPERT_LOCATION: 'all' # 'odd'
530
+ # MOE_LAYER_START_IDX: 3
531
+ # MOE_LAYER_END_IDX: 21
532
+ # MOE_LAYER_START_IDX: 18
533
+ # MOE_LAYER_END_IDX: 12
534
+ BATCH_PRIO: True
535
+ USE_TUTEL: True
536
+ FFN_SHARE_GATE_DECISION: True
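
All of these configs share the same LR_SCHEDULER block: 'WarmupCosine' with WARMUP: 5000 and MIN_LR: 1e-6 against BASE_LR: 0.001 and MAX_ITER: 150000. Assuming the standard definition (linear warmup, then cosine decay to the floor), the schedule reduces to the small reference function below; the repo's scheduler class may differ in minor details.

    import math

    def warmup_cosine_lr(step, base_lr=1e-3, min_lr=1e-6,
                         warmup=5000, max_iter=150000):
        if step < warmup:
            return base_lr * step / max(1, warmup)  # linear warmup
        progress = (step - warmup) / max(1, max_iter - warmup)
        return min_lr + 0.5 * (base_lr - min_lr) * (1.0 + math.cos(math.pi * progress))

    for s in (0, 2500, 5000, 75000, 150000):
        print(s, f"{warmup_cosine_lr(s):.2e}")
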
configs/BERT_L12_H192_experiments/moe_debug_load_ds_checkpoint.yaml ADDED
@@ -0,0 +1,541 @@
1
+ _BASE_: "base_model_bert_l12_h192.yaml"
2
+
3
+ SHARED_TARGETS:
4
+
5
+ # -
6
+ # NAME: 'ImageNet1k'
7
+ # SHARED_TARGETS_CFG:
8
+ # FILE_PATH: 'open_source_dataset/imagenet_class_name_CLIP_with_endoftext.pkl'
9
+ # DISTRIBUTED: False
10
+
11
+ -
12
+ NAME: 'Vocab_Word'
13
+ SHARED_TARGETS_CFG:
14
+ FILE_PATH: 'open_source_dataset/vocabulary_CLIP_with_endoftext.pkl'
15
+ DISTRIBUTED: True
16
+
17
+ # -
18
+ # NAME: 'Kinetics400'
19
+ # SHARED_TARGETS_CFG:
20
+ # FILE_PATH: 'open_source_dataset/k400_class_name_CLIP_with_endoftext.pkl'
21
+ # DISTRIBUTED: False
22
+
23
+
24
+
25
+ TASKS:
26
+
27
+ # -
28
+ # NAME: imagenet
29
+ # DATASETS:
30
+ # TRAIN: 'ImageNetDataset'
31
+ # VAL: 'ImageNetDataset'
32
+ # TASK_TYPE: 'image_classification'
33
+ # DATASET_NAME: 'ImageNet1k'
34
+ # TARGET_SET: ['ImageNet1k']
35
+
36
+ # DATALOADER:
37
+ # TRAIN_BATCH_SIZE: 720
38
+ # # TEST_BATCH_SIZE: 2
39
+ # NUM_WORKERS: 4
40
+ # FEATS_FOLDER: 'cluster2:s3://imagenet'
41
+ # ANNO_FOLDER: 'open_source_dataset/imagenet/meta'
42
+ # SAMPLING_WEIGHT: 2.5
43
+ # CLASS_NAME_FILE: 'open_source_dataset/imagenet_class_name.pkl'
44
+ # MIXUP: 0.8
45
+ # CUTMIX: 1.0
46
+ # MIXUP_PROB: 1.0
47
+ # MIXUP_SWITCH_PROB: 0.5
48
+ # MIXUP_MODE: 'batch'
49
+ # MIXUP_LABEL_SMOOTHING: 0.1
50
+ # MODEL:
51
+ # MAX_SEQ_LEN: -1
52
+ # LABELS_NUM: 1000
53
+ # TEMP_NAME: logit_scale_img_cls
54
+ # LOSSES:
55
+ # NAMES: ['SoftTargetCrossEntropy', 'Accuracy']
56
+ # LOSS_WEIGHT: 1.0
57
+ # REDUCTION: 'mean'
58
+ # # LOSS_FP32: True
59
+ # INFERENCE:
60
+ # NAME: 'ImageNetEvaler'
61
+ # ID_KEY: 'image_id'
62
+ # VALUE: 'cls_logits'
63
+ # VAL_ANNFILE: 'open_source_dataset/imagenet/meta/val.txt'
64
+ # TEST_ANNFILE: ''
65
+ # GENERATION_MODE: False
66
+
67
+ # -
68
+ # NAME: K400_retrieve
69
+ # DATASETS:
70
+ # TRAIN: 'VideoDataSet'
71
+ # VAL: 'VideoDataSet'
72
+ # TASK_TYPE: 'video_classification'
73
+ # DATASET_NAME: 'K400'
74
+ # TARGET_SET: ['Kinetics400']
75
+ # DATALOADER:
76
+ # TRAIN_BATCH_SIZE: 12 # 256
77
+ # TEST_BATCH_SIZE: 4 # debug
78
+ # NUM_WORKERS: 4 # debug 4
79
+ # FEATS_FOLDER: 'open_source_dataset/K400_official'
80
+ # ANNO_FOLDER: 'open_source_dataset/K400_official'
81
+ # S3_PATH: 's3://K400/'
82
+ # FRAMES_PER_CLIP: 8
83
+ # STRIDE: 32
84
+ # FILE_EXTENSION: ''
85
+ # ANNO_FILE: 'annotation.json'
86
+ # TIMESFORMER_AUG: True
87
+ # SAMPLING_WEIGHT: 1.0
88
+ # MODEL:
89
+ # MAX_SEQ_LEN: -1
90
+ # TEMP_NAME: logit_scale_video_cls
91
+ # LOSSES:
92
+ # NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
93
+ # LABELSMOOTHING: 0.1
94
+ # LOSS_WEIGHT: 1.0
95
+ # INFERENCE:
96
+ # NAME: 'MiTEvaler'
97
+ # ID_KEY: 'video_name'
98
+ # VALUE: 'label'
99
+ # VAL_ANNFILE: 'open_source_dataset/K400_official/annotation.json'
100
+ # TEST_ANNFILE: ''
101
+ # GENERATION_MODE: False
102
+ # NUM_VIEWS: 1
103
+
104
+ # -
105
+ # NAME: bookswiki_pretrain
106
+ # DATASETS:
107
+ # TRAIN: 'GeneralCorpusDataset'
108
+ # TASK_TYPE: 'text_mlm'
109
+ # DATASET_NAME: 'BooksWiki'
110
+ # TARGET_SET: ['Vocab_Word']
111
+ # VERSION: 'v2'
112
+ # DATALOADER:
113
+ # TRAIN_BATCH_SIZE: 512
114
+ # TEST_BATCH_SIZE: 32
115
+ # NUM_WORKERS: 2
116
+ # ANNO_FOLDER: 'open_source_dataset/text_corpus' # 'open_source_dataset/bert_pretrain_data/bookswiki'
117
+ # # ANNO_FOLDER: 'open_source_dataset/bert_pretrain_data/bookswiki'
118
+ # SEQ_PER_SAMPLE: 1
119
+ # SAMPLER: NodeDistributed
120
+ # CACHE_MODE: True
121
+ # SEQ_PER_SAMPLE: 128
122
+ # MIN_SEQ_PER_SAMPLE: 128
123
+ # APPEND_EOS: True
124
+ # ONE_STREAM: False
125
+ # SAMPLING_WEIGHT: 3.5
126
+ # RANDOM_MASK: True
127
+ # MODEL:
128
+ # MAX_SEQ_LEN: 128
129
+ # TEMP_NAME: logit_scale_text_mlm
130
+ # LOSSES:
131
+ # NAMES: ['CrossEntropy', 'Accuracy']
132
+ # LOSS_WEIGHT: 0.33333
133
+ # REDUCTION: 'mean'
134
+ # INFERENCE:
135
+ # VOCAB: 'CLIP'
136
+ # GENERATION_MODE: False
137
+ # -
138
+ # NAME: mscoco_retrieve
139
+ # DATASETS:
140
+ # TRAIN: 'ImageTextPairDataset'
141
+ # TEST: 'ImageTextPairDataset'
142
+ # TASK_TYPE: 'image_retrieval'
143
+ # DATASET_NAME: 'MSCOCO'
144
+ # DATALOADER:
145
+ # TRAIN_BATCH_SIZE: 100
146
+ # TEST_BATCH_SIZE: 32
147
+ # NUM_WORKERS: 1
148
+ # FEATS_FOLDER: 'open_source_dataset/mscoco_dataset/coco_origin'
149
+ # ANNO_FOLDER: 'open_source_dataset/mscoco_dataset/new_annotations'
150
+ # S3_PATH: 's3://coco/'
151
+ # SEQ_PER_SAMPLE: 1
152
+ # CACHE_MODE: True
153
+ # CIRCULAR_CACHE_MODE: False
154
+ # ZIP_MODE: False
155
+ # CACHE_ORIGIN_IMAGE: False
156
+ # RANDOM_CAPTION: False
157
+ # AS_NUMPY_AS_POSSIBLE: False
158
+ # SAMPLING_WEIGHT: 1.0
159
+ # TRANSFORM: 'clip_transforms'
160
+ # MODEL:
161
+ # MAX_SEQ_LEN: 50
162
+ # TEMP_NAME: logit_scale_retrieve
163
+ # LOSSES:
164
+ # NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
165
+ # LABELSMOOTHING: 0.1
166
+ # LOSS_WEIGHT: 1.0
167
+ # REDUCTION: 'mean'
168
+ # INFERENCE:
169
+ # VOCAB: 'CLIP'
170
+ # ID_KEY: 'image_id'
171
+ # VALUE: 'caption'
172
+ # NAME: 'RetrievalEvaler'
173
+ # VAL_ANNFILE: 'open_source_dataset/flickr30k/all_data_final_val_set0_2014.jsonline'
174
+ # TEST_ANNFILE: 'open_source_dataset/flickr30k/all_data_final_test_set0_2014.jsonline'
175
+ # GENERATION_MODE: False
176
+
177
+ ########## Image Captioning ###########
178
+
179
+
180
+ # -
181
+ # NAME: cc12m_caption
182
+ # DATASETS:
183
+ # TRAIN: 'ImageTextPairDataset'
184
+ # TASK_TYPE: 'image_caption'
185
+ # DATASET_NAME: 'CC12M'
186
+ # TARGET_SET: ['Vocab_Word']
187
+ # DATALOADER:
188
+ # TRAIN_BATCH_SIZE: 300
189
+ # TEST_BATCH_SIZE: 32
190
+ # NUM_WORKERS: 2
191
+ # S3_ANNO_FOLDER: 's3://cc12m/'
192
+ # ANNO_FOLDER: 'open_source_dataset/c12m/'
193
+ # ANNO_FILENAME: 'train_available.json'
194
+ # FEATS_FOLDER: 'open_source_dataset/c12m/'
195
+ # S3_PATH: 's3://cc12m/'
196
+ # SEQ_PER_SAMPLE: 1
197
+ # SAMPLER: NodeDistributed
198
+ # CACHE_MODE: True
199
+ # CIRCULAR_CACHE_MODE: False
200
+ # ZIP_MODE: False
201
+ # CACHE_ORIGIN_IMAGE: False
202
+ # RANDOM_CAPTION: False
203
+ # AS_NUMPY_AS_POSSIBLE: False
204
+ # SAMPLING_WEIGHT: 1.6889
205
+ # TRANSFORM: 'clip_transforms'
206
+ # MODEL:
207
+ # MAX_SEQ_LEN: 50
208
+ # TEMP_NAME: logit_scale_caption
209
+ # LOSSES:
210
+ # NAMES: ['CrossEntropy', 'Accuracy']
211
+ # LOSS_WEIGHT: 0.33333
212
+ # REDUCTION: 'mean'
213
+ # INFERENCE:
214
+ # VOCAB: 'CLIP'
215
+ # GENERATION_MODE: False
216
+
217
+ # -
218
+ # NAME: cc3m_caption
219
+ # DATASETS:
220
+ # TRAIN: 'ImageTextPairDataset'
221
+ # TASK_TYPE: 'image_caption'
222
+ # DATASET_NAME: 'CC3M'
223
+ # TARGET_SET: ['Vocab_Word']
224
+ # DATALOADER:
225
+ # TRAIN_BATCH_SIZE: 300
226
+ # TEST_BATCH_SIZE: 32
227
+ # NUM_WORKERS: 2
228
+ # S3_ANNO_FOLDER: 's3://cc3m/'
229
+ # ANNO_FOLDER: 'open_source_dataset/cc3m/'
230
+ # ANNO_FILENAME: 'train_spacy.json'
231
+ # FEATS_FOLDER: 'open_source_dataset/cc3m/'
232
+ # S3_PATH: 's3://cc3m/'
233
+ # SEQ_PER_SAMPLE: 1
234
+ # SAMPLER: NodeDistributed
235
+ # CACHE_MODE: True
236
+ # CIRCULAR_CACHE_MODE: False
237
+ # ZIP_MODE: False
238
+ # CACHE_ORIGIN_IMAGE: False
239
+ # RANDOM_CAPTION: False
240
+ # AS_NUMPY_AS_POSSIBLE: False
241
+ # SAMPLING_WEIGHT: 0.8780
242
+ # TRANSFORM: 'clip_transforms'
243
+ # MODEL:
244
+ # MAX_SEQ_LEN: 50
245
+ # TEMP_NAME: logit_scale_caption
246
+ # LOSSES:
247
+ # NAMES: ['CrossEntropy', 'Accuracy']
248
+ # LOSS_WEIGHT: 0.33333
249
+ # REDUCTION: 'mean'
250
+ # INFERENCE:
251
+ # VOCAB: 'CLIP'
252
+ # GENERATION_MODE: False
253
+
254
+ # -
255
+ # NAME: vg_caption
256
+ # DATASETS:
257
+ # TRAIN: 'ImageTextPairDataset'
258
+ # TASK_TYPE: 'image_caption'
259
+ # DATASET_NAME: 'VG'
260
+ # TARGET_SET: ['Vocab_Word']
261
+ # DATALOADER:
262
+ # TRAIN_BATCH_SIZE: 300
263
+ # TEST_BATCH_SIZE: 32
264
+ # NUM_WORKERS: 2
265
+ # FEATS_FOLDER: 'open_source_dataset/visual_genome/images'
266
+ # ANNO_FOLDER: 'open_source_dataset/visual_genome/annotations'
267
+ # S3_PATH: 's3://visual_genome/images'
268
+ # ANNO_FILENAME: 'vg_captions_128filter.json'
269
+ # SEQ_PER_SAMPLE: 1
270
+ # CACHE_MODE: True
271
+ # CIRCULAR_CACHE_MODE: False
272
+ # ZIP_MODE: False
273
+ # CACHE_ORIGIN_IMAGE: False
274
+ # RANDOM_CAPTION: False
275
+ # AS_NUMPY_AS_POSSIBLE: False
276
+ # SAMPLING_WEIGHT: 0.5895
277
+ # TRANSFORM: 'clip_transforms'
278
+ # MODEL:
279
+ # MAX_SEQ_LEN: 30
280
+ # TEMP_NAME: logit_scale_caption
281
+ # LOSSES:
282
+ # NAMES: ['CrossEntropy', 'Accuracy']
283
+ # LOSS_WEIGHT: 0.33333
284
+ # REDUCTION: 'mean'
285
+ # INFERENCE:
286
+ # VOCAB: 'CLIP'
287
+ # GENERATION_MODE: True
288
+
289
+ -
290
+ NAME: mscoco_caption
291
+ DATASETS:
292
+ TRAIN: 'ImageTextPairDataset'
293
+ # VAL: 'ImageTextPairDataset'
294
+ TEST: 'ImageTextPairDataset'
295
+ TASK_TYPE: 'image_caption'
296
+ DATASET_NAME: 'MSCOCO'
297
+ TARGET_SET: ['Vocab_Word']
298
+ DATALOADER:
299
+ TRAIN_BATCH_SIZE: 32
300
+ TEST_BATCH_SIZE: 2
301
+ NUM_WORKERS: 4
302
+ FEATS_FOLDER: 'open_source_dataset/mscoco_dataset/coco_origin'
303
+ ANNO_FOLDER: 'open_source_dataset/mscoco_dataset/new_annotations'
304
+ S3_PATH: 's3://coco/'
305
+ SEQ_PER_SAMPLE: 1
306
+ CACHE_MODE: True
307
+ CIRCULAR_CACHE_MODE: False
308
+ ZIP_MODE: False
309
+ CACHE_ORIGIN_IMAGE: False
310
+ RANDOM_CAPTION: False
311
+ AS_NUMPY_AS_POSSIBLE: False
312
+ SAMPLING_WEIGHT: 0.3817
313
+ TRANSFORM: 'clip_transforms'
314
+ RANDOM_MASK: True
315
+ MODEL:
316
+ MAX_SEQ_LEN: 50
317
+ EVAL_MAX_SEQ_LEN: 21
318
+ TEMP_NAME: logit_scale_caption
319
+ LOSSES:
320
+ NAMES: ['CrossEntropy', 'Accuracy']
321
+ LOSS_WEIGHT: 0.33333
322
+ REDUCTION: 'mean'
323
+ DECODE_STRATEGY:
324
+ NAME: 'CaptionBeamSearcherV3'
325
+ BEAM_SIZE: 2
326
+ # LEN_PENALTY: 2.0
327
+ INFERENCE:
328
+ NAME: 'COCOEvaler'
329
+ VOCAB: 'CLIP'
330
+ ID_KEY: 'image_id'
331
+ VALUE: 'caption'
332
+ VAL_ANNFILE: 'open_source_dataset/mscoco_dataset/new_annotations/captions_val5k.json'
333
+ TEST_ANNFILE: 'open_source_dataset/mscoco_dataset/new_annotations/captions_test5k.json'
334
+ GENERATION_MODE: True
335
+
336
+ # -
337
+ # NAME: sbu_caption
338
+ # DATASETS:
339
+ # TRAIN: 'ImageTextPairDataset'
340
+ # TASK_TYPE: 'image_caption'
341
+ # DATASET_NAME: 'SBU'
342
+ # TARGET_SET: ['Vocab_Word']
343
+ # DATALOADER:
344
+ # TRAIN_BATCH_SIZE: 300
345
+ # TEST_BATCH_SIZE: 32
346
+ # NUM_WORKERS: 1
347
+ # S3_ANNO_FOLDER: 's3://SBU/annotations'
348
+ # ANNO_FOLDER: 'open_source_dataset/sbucaption/annotations'
349
+ # ANNO_FILENAME: 'subcaption.json'
350
+ # FEATS_FOLDER: 'open_source_dataset/sbucaption/'
351
+ # S3_PATH: 's3://SBU/images'
352
+ # SEQ_PER_SAMPLE: 1
353
+ # SAMPLER: NodeDistributed
354
+ # CACHE_MODE: True
355
+ # CIRCULAR_CACHE_MODE: False
356
+ # ZIP_MODE: False
357
+ # CACHE_ORIGIN_IMAGE: False
358
+ # RANDOM_CAPTION: False
359
+ # AS_NUMPY_AS_POSSIBLE: False
360
+ # SAMPLING_WEIGHT: 0.4618
361
+ # TRANSFORM: 'clip_transforms'
362
+ # MODEL:
363
+ # MAX_SEQ_LEN: 50
364
+ # TEMP_NAME: logit_scale_caption
365
+ # LOSSES:
366
+ # NAMES: ['CrossEntropy', 'Accuracy']
367
+ # LOSS_WEIGHT: 0.33333
368
+ # REDUCTION: 'mean'
369
+ # INFERENCE:
370
+ # VOCAB: 'CLIP'
371
+ # GENERATION_MODE: False
372
+
373
+
374
+ ENGINE:
375
+ NAME: 'UnifiedTrainer'
376
+
377
+ MODEL:
378
+ META_ARCHITECTURE: 'MultiTaskTransformerEncoder'
379
+ ENCODER: 'UnifiedBertEncoder'
380
+
381
+ IN_TUNING: True # use IN1k instead of 22k
382
+ SHARE_LAYERNORM: True
383
+ BERT:
384
+ NORMALIZE_DECISION: "BERTPre"
385
+ DROP_PATH_PROB: 0.0
386
+ DROP_PATH_PROB_FIXED: True
387
+
388
+ MODEL_EMA: False
389
+ MODEL_EMA_DECAY: 0.9999
390
+
391
+ MAEParamsInit: True
392
+ POSEMBEDFIX: True
393
+
394
+
395
+ IMG_INPUT_SIZE: 224
396
+ PATCH_SIZE: 16
397
+
398
+ LAYER_SCALE: True
399
+ LAYER_SCALE_INIT: 1e-3
400
+
401
+ VIDEO_EMBED:
402
+ ADD_TYPE_EMBED: True
403
+
404
+
405
+ DATALOADER:
406
+ USE_WEIGHTED_SAMPLER: True
407
+ UNIFIED_DATASET: True
408
+ NUM_WORKERS: 32
409
+ STRATEGY: 'turn'
410
+
411
+ PADDING_TO_MAX: False # set True for debugging, or for token MoE with distributed MoE
412
+
413
+
414
+
415
+ ####################################### Optimizer #######################################
416
+ SOLVER:
417
+ NAME: 'Adam'
418
+ TORCH_OPTIMIZER: True
419
+ PARAMS_SEPERATE: True
420
+ # PARAMS_GROUP: True
421
+ # EPOCH: 1
422
+ MAX_ITER: 150000
423
+ CHECKPOINT_PERIOD: 5000
424
+ EVAL_PERIOD: 500000
425
+ BASE_LR: 0.001
426
+ BIAS_LR_FACTOR: 1.0
427
+ WEIGHT_DECAY: 0.05
428
+ WEIGHT_DECAY_NORM: 0.0
429
+ WEIGHT_DECAY_BIAS: 0.0
430
+ WEIGHT_DECAY_EMBEDDING: 0.0
431
+ MOMENTUM: 0.9
432
+ DAMPENING: 0.0
433
+ NESTEROV: 0.0
434
+ BETAS: [0.9, 0.95]
435
+ EPS: 1e-6
436
+ GRAD_CLIP: 0.1
437
+ GRAD_CLIP_TYPE: 'norm'
438
+ ACCUM_ITER: 0
439
+ AMP_FP16: True
440
+ APEX_FP16: False # dangerous
441
+
442
+ WRITE_PERIOD: 50
443
+ MIN_LOSS_SCLE: 2048.0
444
+ # BF16: False # True
445
+ # ZEROSTAGE: 2
446
+
447
+ LOSS_SCALE_WINDOW: 200
448
+
449
+ FORCE_SOFTMAX_FP16: True
450
+ FORCE_LN_FP16: True
451
+ FORCE_NORM_FP16: True
452
+ # FORCE_TEMP_FP16: True
453
+ FORCE_EMBED_FP16: True
454
+
455
+ # the two flags below are for debugging only
456
+ FORCE_WG_RECAST: True
457
+ FORCE_EXPERT_ADDING_FP16: True
458
+
459
+ # !!! note: VIDEO_EMBED.ADD_TYPE_EMBED=True is part of the current config
460
+
461
+
462
+
463
+
464
+
465
+
466
+ ####################################### lr scheduler #######################################
467
+ LR_SCHEDULER:
468
+ NAME: 'WarmupCosine'
469
+ WARMUP: 5000
470
+ MIN_LR: 0.000001
471
+
472
+
473
+
474
+
475
+ ####################################### evaluation #######################################
476
+ INFERENCE:
477
+
478
+ VOCAB: 'CLIP'
479
+ ITER_BASED: True
480
+
481
+
482
+ find_unused_parameters: true
483
+
484
+ # ENCODERS:
485
+ # -
486
+ # NAME: VisualEncoder
487
+ # TYPE: VisualEncoder
488
+ # DROP_PATH_PROB: 0.0
489
+ # HIDDEN_SIZE: 192
490
+ # HIDDEN_DROPOUT_PROB: 0.
491
+ # HIDDEN_ACT: "gelu"
492
+ # NUM_ATTENTION_HEADS: 3
493
+ # INTERMEDIATE_SIZE: 768
494
+ # INTERMEDIATE_DROP: 0.
495
+ # FFN_DROPOUT_PROB: 0.
496
+ # ATTENTION_PROBS_DROPOUT_PROB: 0.
497
+ # NUM_HIDDEN_LAYERS: 6
498
+ # NUM_GENERATION_LAYERS: 0
499
+ # DROP_PATH_PROB_FIXED: True
500
+
501
+ # -
502
+ # NAME: TextEncoder
503
+ # TYPE: TextEncoder
504
+ # DROP_PATH_PROB: 0.0
505
+ # HIDDEN_SIZE: 192
506
+ # HIDDEN_DROPOUT_PROB: 0.
507
+ # HIDDEN_ACT: "gelu"
508
+ # NUM_ATTENTION_HEADS: 3
509
+ # INTERMEDIATE_SIZE: 768
510
+ # INTERMEDIATE_DROP: 0.
511
+ # FFN_DROPOUT_PROB: 0.
512
+ # ATTENTION_PROBS_DROPOUT_PROB: 0.
513
+ # NUM_HIDDEN_LAYERS: 6
514
+ # NUM_GENERATION_LAYERS: 0
515
+ # DROP_PATH_PROB_FIXED: True
516
+
517
+
518
+
519
+ MOE:
520
+ MOE: True
521
+ MOE_TYPE: 'attribute'
522
+ TAG_Transform: True
523
+ ATTRIBUTE_LENGTH: 8
524
+ EP_WORLD_SIZE: 1 # expert-parallel world size; used for tag MoE only
525
+ NUM_EXPERTS: 8
526
+ TOP_K: 2
527
+ CAPACITY_FACTOR: 3.0
528
+ EVAL_MIN_CAPACITY: 4.0
529
+ MIN_CAPACITY: 4
530
+ NOISY_GATE_POLICY: 'vmoe'
531
+ MOE_PARAM_GROUP: True
532
+ MOE_EXPERT_TYPE: 'FFN,SA'
533
+ SA_LINEAR_OUT_MOE: True
534
+ MOE_EXPERT_LOCATION: 'all' # 'odd'
535
+ # MOE_LAYER_START_IDX: 3
536
+ # MOE_LAYER_END_IDX: 21
537
+ # MOE_LAYER_START_IDX: 18
538
+ # MOE_LAYER_END_IDX: 12
539
+ BATCH_PRIO: True
540
+ USE_TUTEL: True
541
+ FFN_SHARE_GATE_DECISION: True
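
The SOLVER blocks above run Adam under AMP_FP16 with gradient-norm clipping (GRAD_CLIP: 0.1, GRAD_CLIP_TYPE: 'norm') and a loss-scale growth window of 200. Below is a sketch of that update using the stock torch.cuda.amp pattern; this is not the repo's UnifiedTrainer code, only the shape of the step it configures.

    import torch
    import torch.nn.functional as F

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = torch.nn.Linear(192, 8).to(device)
    opt = torch.optim.Adam(model.parameters(), lr=1e-3,
                           betas=(0.9, 0.95), eps=1e-6)       # cf. SOLVER above
    scaler = torch.cuda.amp.GradScaler(growth_interval=200,   # cf. LOSS_SCALE_WINDOW
                                       enabled=(device == 'cuda'))

    def train_step(x, y):
        opt.zero_grad(set_to_none=True)
        with torch.autocast(device_type=device, dtype=torch.float16,
                            enabled=(device == 'cuda')):      # AMP_FP16
            loss = F.cross_entropy(model(x), y)
        scaler.scale(loss).backward()
        scaler.unscale_(opt)                    # expose true grads before clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)  # GRAD_CLIP, 'norm'
        scaler.step(opt)
        scaler.update()
        return loss.item()

    print(train_step(torch.randn(4, 192, device=device),
                     torch.randint(0, 8, (4,), device=device)))
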
configs/BERT_L12_H192_experiments/mscoco_caption_debug.yaml ADDED
@@ -0,0 +1,234 @@
1
+ _BASE_: "base_model_bert_l12_h192.yaml"
2
+
3
+ SHARED_TARGETS:
4
+
5
+
6
+ -
7
+ NAME: 'Vocab_Word'
8
+ SHARED_TARGETS_CFG:
9
+ FILE_PATH: 'open_source_dataset/vocabulary_CLIP_with_endoftext.pkl'
10
+ DISTRIBUTED: True
11
+
12
+
13
+
14
+ TASKS:
15
+
16
+ -
17
+ NAME: mscoco_retrieve
18
+ DATASETS:
19
+ TRAIN: 'ImageTextPairDataset'
20
+ TEST: 'ImageTextPairDataset'
21
+ TASK_TYPE: 'image_retrieval'
22
+ DATASET_NAME: 'MSCOCO'
23
+ DATALOADER:
24
+ TRAIN_BATCH_SIZE: 100
25
+ TEST_BATCH_SIZE: 32
26
+ NUM_WORKERS: 1
27
+ FEATS_FOLDER: 'open_source_dataset/mscoco_dataset/coco_origin'
28
+ ANNO_FOLDER: 'open_source_dataset/mscoco_dataset/new_annotations'
29
+ S3_PATH: 's3://coco/'
30
+ SEQ_PER_SAMPLE: 1
31
+ CACHE_MODE: True
32
+ CIRCULAR_CACHE_MODE: False
33
+ ZIP_MODE: False
34
+ CACHE_ORIGIN_IMAGE: False
35
+ RANDOM_CAPTION: False
36
+ AS_NUMPY_AS_POSSIBLE: False
37
+ SAMPLING_WEIGHT: 1.0
38
+ TRANSFORM: 'clip_transforms'
39
+ MODEL:
40
+ MAX_SEQ_LEN: 50
41
+ TEMP_NAME: logit_scale_retrieve
42
+ LOSSES:
43
+ NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
44
+ LABELSMOOTHING: 0.1
45
+ LOSS_WEIGHT: 1.0
46
+ REDUCTION: 'mean'
47
+ INFERENCE:
48
+ VOCAB: 'CLIP'
49
+ ID_KEY: 'image_id'
50
+ VALUE: 'caption'
51
+ NAME: 'RetrievalEvaler'
52
+ VAL_ANNFILE: 'open_source_dataset/flickr30k/all_data_final_val_set0_2014.jsonline'
53
+ TEST_ANNFILE: 'open_source_dataset/flickr30k/all_data_final_test_set0_2014.jsonline'
54
+ GENERATION_MODE: False
55
+
56
+
57
+ -
58
+ NAME: mscoco_caption
59
+ DATASETS:
60
+ TRAIN: 'ImageTextPairDataset'
61
+ # VAL: 'ImageTextPairDataset'
62
+ TEST: 'ImageTextPairDataset'
63
+ TASK_TYPE: 'image_caption'
64
+ DATASET_NAME: 'MSCOCO'
65
+ TARGET_SET: ['Vocab_Word']
66
+ DATALOADER:
67
+ TRAIN_BATCH_SIZE: 300
68
+ TEST_BATCH_SIZE: 32
69
+ NUM_WORKERS: 4
70
+ FEATS_FOLDER: 'open_source_dataset/mscoco_dataset/coco_origin'
71
+ ANNO_FOLDER: 'open_source_dataset/mscoco_dataset/new_annotations'
72
+ S3_PATH: 's3://coco/'
73
+ SEQ_PER_SAMPLE: 1
74
+ CACHE_MODE: True
75
+ CIRCULAR_CACHE_MODE: False
76
+ ZIP_MODE: False
77
+ CACHE_ORIGIN_IMAGE: False
78
+ RANDOM_CAPTION: False
79
+ AS_NUMPY_AS_POSSIBLE: False
80
+ SAMPLING_WEIGHT: 0.3817
81
+ TRANSFORM: 'clip_transforms'
82
+ RANDOM_MASK: True
83
+ MODEL:
84
+ MAX_SEQ_LEN: 50
85
+ EVAL_MAX_SEQ_LEN: 21
86
+ TEMP_NAME: logit_scale_caption
87
+ LOSSES:
88
+ NAMES: ['CrossEntropy', 'Accuracy']
89
+ LOSS_WEIGHT: 0.33333
90
+ REDUCTION: 'mean'
91
+ DECODE_STRATEGY:
92
+ NAME: 'CaptionBeamSearcherV3'
93
+ BEAM_SIZE: 2
94
+ # LEN_PENALTY: 1.0
95
+ INFERENCE:
96
+ NAME: 'COCOEvaler'
97
+ VOCAB: 'CLIP'
98
+ ID_KEY: 'image_id'
99
+ VALUE: 'caption'
100
+ VAL_ANNFILE: 'open_source_dataset/mscoco_dataset/new_annotations/captions_val5k.json'
101
+ TEST_ANNFILE: 'open_source_dataset/mscoco_dataset/new_annotations/captions_test5k.json'
102
+ GENERATION_MODE: True
103
+
104
+
105
+
106
+
107
+
108
+ ENGINE:
109
+ NAME: 'UnifiedTrainer'
110
+
111
+ MODEL:
112
+ META_ARCHITECTURE: 'MultiTaskTransformerEncoder'
113
+ ENCODER: 'UnifiedBertEncoder'
114
+
115
+ IN_TUNING: True # use IN1k instead of 22k
116
+ SHARE_LAYERNORM: True
117
+ BERT:
118
+ NORMALIZE_DECISION: "BERTPre"
119
+ DROP_PATH_PROB: 0.0
120
+ DROP_PATH_PROB_FIXED: True
121
+
122
+ MODEL_EMA: False
123
+ MODEL_EMA_DECAY: 0.9999
124
+
125
+ MAEParamsInit: True
126
+ POSEMBEDFIX: True
127
+
128
+
129
+ IMG_INPUT_SIZE: 224
130
+ PATCH_SIZE: 16
131
+
132
+ LAYER_SCALE: True
133
+ LAYER_SCALE_INIT: 1e-3
134
+
135
+
136
+ DATALOADER:
137
+ USE_WEIGHTED_SAMPLER: True
138
+ UNIFIED_DATASET: True
139
+ NUM_WORKERS: 32
140
+
141
+ PADDING_TO_MAX: False # set True for debugging, or for token MoE with distributed MoE
142
+
143
+
144
+
145
+ ####################################### Optimizer #######################################
146
+ SOLVER:
147
+ NAME: 'Adam'
148
+ TORCH_OPTIMIZER: True
149
+ PARAMS_SEPERATE: True
150
+ # PARAMS_GROUP: True
151
+ # EPOCH: 1
152
+ MAX_ITER: 150000
153
+ CHECKPOINT_PERIOD: 5000
154
+ EVAL_PERIOD: 500000
155
+ BASE_LR: 0.001
156
+ BIAS_LR_FACTOR: 1.0
157
+ WEIGHT_DECAY: 0.05
158
+ WEIGHT_DECAY_NORM: 0.0
159
+ WEIGHT_DECAY_BIAS: 0.0
160
+ WEIGHT_DECAY_EMBEDDING: 0.0
161
+ MOMENTUM: 0.9
162
+ DAMPENING: 0.0
163
+ NESTEROV: 0.0
164
+ BETAS: [0.9, 0.95]
165
+ EPS: 1e-6
166
+ GRAD_CLIP: 0.1
167
+ GRAD_CLIP_TYPE: 'norm'
168
+ ACCUM_ITER: 0
169
+ AMP_FP16: True
170
+ APEX_FP16: False # dangerous
171
+
172
+ WRITE_PERIOD: 50
173
+ MIN_LOSS_SCLE: 2048.0
174
+ # BF16: False # True
175
+ # ZEROSTAGE: 2
176
+
177
+ LOSS_SCALE_WINDOW: 200
178
+
179
+
180
+
181
+
182
+
183
+
184
+ ####################################### lr scheduler #######################################
185
+ LR_SCHEDULER:
186
+ NAME: 'WarmupCosine'
187
+ WARMUP: 5000
188
+ MIN_LR: 0.000001
189
+
190
+
191
+
192
+
193
+ ####################################### evaluation #######################################
194
+ INFERENCE:
195
+
196
+ VOCAB: 'CLIP'
197
+ ITER_BASED: True
198
+
199
+
200
+ find_unused_parameters: true
201
+
202
+ # ENCODERS:
203
+ # -
204
+ # NAME: VisualEncoder
205
+ # TYPE: VisualEncoder
206
+ # DROP_PATH_PROB: 0.0
207
+ # HIDDEN_SIZE: 192
208
+ # HIDDEN_DROPOUT_PROB: 0.
209
+ # HIDDEN_ACT: "gelu"
210
+ # NUM_ATTENTION_HEADS: 3
211
+ # INTERMEDIATE_SIZE: 768
212
+ # INTERMEDIATE_DROP: 0.
213
+ # FFN_DROPOUT_PROB: 0.
214
+ # ATTENTION_PROBS_DROPOUT_PROB: 0.
215
+ # NUM_HIDDEN_LAYERS: 6
216
+ # NUM_GENERATION_LAYERS: 0
217
+ # DROP_PATH_PROB_FIXED: True
218
+
219
+ # -
220
+ # NAME: TextEncoder
221
+ # TYPE: TextEncoder
222
+ # DROP_PATH_PROB: 0.0
223
+ # HIDDEN_SIZE: 192
224
+ # HIDDEN_DROPOUT_PROB: 0.
225
+ # HIDDEN_ACT: "gelu"
226
+ # NUM_ATTENTION_HEADS: 3
227
+ # INTERMEDIATE_SIZE: 768
228
+ # INTERMEDIATE_DROP: 0.
229
+ # FFN_DROPOUT_PROB: 0.
230
+ # ATTENTION_PROBS_DROPOUT_PROB: 0.
231
+ # NUM_HIDDEN_LAYERS: 6
232
+ # NUM_GENERATION_LAYERS: 0
233
+ # DROP_PATH_PROB_FIXED: True
234
+
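
The mscoco_caption task above decodes with CaptionBeamSearcherV3 at BEAM_SIZE: 2 and caps generated captions at EVAL_MAX_SEQ_LEN: 21. For orientation, here is a toy beam search over an arbitrary next-token log-probability function; the real searcher runs over the transformer decoder, and this only sketches the search loop itself.

    import torch

    def beam_search(step_logprobs, bos, eos, beam_size=2, max_len=21):
        # step_logprobs(prefix) -> [vocab] log-probs for the next token
        beams = [([bos], 0.0)]
        for _ in range(max_len):
            cand = []
            for seq, score in beams:
                if seq[-1] == eos:              # finished beams carry over
                    cand.append((seq, score))
                    continue
                top = torch.topk(step_logprobs(seq), beam_size)
                for p, t in zip(top.values.tolist(), top.indices.tolist()):
                    cand.append((seq + [t], score + p))
            beams = sorted(cand, key=lambda c: c[1], reverse=True)[:beam_size]
            if all(s[-1] == eos for s, _ in beams):
                break
        return beams[0][0]

    vocab = 10  # toy vocabulary
    dummy = lambda seq: torch.log_softmax(torch.randn(vocab), dim=-1)
    print(beam_search(dummy, bos=1, eos=2))
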
configs/BERT_L12_H192_experiments/vqa_debug.yaml ADDED
@@ -0,0 +1,189 @@
1
+ _BASE_: "base_model_bert_l12_h192.yaml"
2
+
3
+ SHARED_TARGETS:
4
+
5
+
6
+ -
7
+ NAME: 'VQA_Answer'
8
+ SHARED_TARGETS_CFG:
9
+ FILE_PATH: 'open_source_dataset/VQA_Answers_CLIP_with_endoftext.pkl'
10
+ DISTRIBUTED: True
11
+
12
+ TASKS:
13
+ -
14
+ NAME: vqa
15
+ DATASETS:
16
+ TRAIN: 'VQADataset'
17
+ VAL: 'VQADataset'
18
+ DATASET_NAME: 'VQA'
19
+ TASK_TYPE: 'vqa'
20
+ TARGET_SET: ['VQA_Answer']
21
+ DATALOADER:
22
+ TRAIN_BATCH_SIZE: 256
23
+ TEST_BATCH_SIZE: 128
24
+ NUM_WORKERS: 4
25
+ FEATS_FOLDER: 'open_source_dataset/mscoco_dataset/coco_origin'
26
+ ANNO_FOLDER: 'open_source_dataset/VQA'
27
+ SEQ_PER_SAMPLE: 1
28
+ MAX_FEAT_NUM: 51
29
+ SAMPLING_WEIGHT: 1.0
30
+ TRANSFORM: 'clip_transforms'
31
+ DO_AS_GEN: True
32
+ SINGLE_CLASS: True
33
+ MODEL:
34
+ # VOCAB_SIZE: 49409 # include <BOS>/<EOS>
35
+ PREDICTOR: 'MLPClassifer'
36
+ # MM_PREDICTOR:
37
+ # LABELS_NUM: 3129
38
+ # PREDICT: 'first_one'
39
+ # PRED_DROPOUT: 0.5
40
+ MAX_SEQ_LEN: 23
41
+ # QUERY_EMBED:
42
+ # NAME: QueryBaseEmbedding
43
+ # DIM: 512
44
+ # QUERY_SIZE: 10 # more than 1 is ok
45
+ # ACTIVATION: 'none'
46
+ # USE_NORM: True
47
+ # DROPOUT: 0.1
48
+ # POSITION: 'none' # must be none now
49
+ # TYPE_VOCAB_SIZE: -1 # must < 0
50
+ LOSSES:
51
+ # not single class
52
+ # NAMES: ['BCEWithLogits']
53
+ # LOSS_WEIGHT: 0.05
54
+ # for single class
55
+ NAMES: ['CrossEntropy']
56
+ LOSS_WEIGHT: 0.1
57
+ INFERENCE:
58
+ VOCAB: 'CLIP'
59
+ NAME: 'VQAEvaler'
60
+ ID_KEY: 'question_id'
61
+ VALUE: 'answer'
62
+ VAL_ANNFILE: 'open_source_dataset/VQA/val_target.pkl'
63
+ TEST_ANNFILE: ''
64
+ GENERATION_MODE: False
65
+
66
+
67
+ ######################################### Engine #########################################
68
+ ENGINE:
69
+ NAME: 'UnifiedTrainer'
70
+
71
+ ######################################### Scheduled sampling #########################################
72
+ SCHEDULED_SAMPLING:
73
+ START_EPOCH: 0
74
+ INC_EVERY_EPOCH: 5
75
+ INC_PROB: 0.05
76
+ MAX_PROB: 0.25
77
+
78
+ DATALOADER:
79
+ USE_WEIGHTED_SAMPLER: True
80
+ UNIFIED_DATASET: True
81
+
82
+ ######################################### MODEL #########################################
83
+ MODEL:
84
+ TEMP_NAME: logit_scale_downstream
85
+ # VOCAB_SIZE: 49409 # include <BOS>/<EOS>
86
+ META_ARCHITECTURE: 'MultiTaskTransformerEncoder'
87
+ ENCODER: 'UnifiedBertEncoder'
88
+ # ENCODER_DIM: 512
89
+ # DECODER: 'UnifiedTransformerDecoder'
90
+ # DECODER_DIM: 512
91
+
92
+ BertParamsInit: True
93
+ # WEIGHTS: open_source_dataset/our_model/cc3m_encoder_decoder_warm1w_150k_retrivetask_gatherfeature_caption_mlm/model_Epoch_90000_Iter_0089999.pth
94
+
95
+ CLS_TOKEN: True
96
+ # PREDICTOR: 'BasePredictor'
97
+ # PRED_DROPOUT: 0.5
98
+ # MAX_SEQ_LEN: 20
99
+
100
+ # #################################### Token embedding ####################################
101
+ # TOKEN_EMBED:
102
+ # NAME: 'TokenBaseEmbedding'
103
+ # DIM: 512
104
+ # ACTIVATION: 'none'
105
+ # USE_NORM: True
106
+ # DROPOUT: 0.1
107
+ # POSITION: 'NNEmbeddingEncoding'
108
+ # POSITION_MAX_LEN: 512
109
+ # TYPE_VOCAB_SIZE: 2
110
+
111
+ # #################################### Visual embedding ####################################
112
+ # VISUAL_EMBED:
113
+ # NAME: 'VisualPatchEmbedding'
114
+ # IN_DIM: 3
115
+ # OUT_DIM: 512
116
+ # ACTIVATION: 'none'
117
+ # USE_NORM: True
118
+ # DROPOUT: 0.0
119
+ # PATCH_SIZE: 16
120
+
121
+ ####################################### BERT ############################################
122
+ BERT:
123
+ DROP_PATH_PROB: 0.05
124
+ # HIDDEN_SIZE: 512
125
+ HIDDEN_SIZE: 192
126
+ HIDDEN_DROPOUT_PROB: 0.
127
+ HIDDEN_ACT: "gelu"
128
+ NUM_ATTENTION_HEADS: 8
129
+ INTERMEDIATE_SIZE: 2048
130
+ INTERMEDIATE_DROP: 0.
131
+ FFN_DROPOUT_PROB: 0.
132
+ ATTENTION_PROBS_DROPOUT_PROB: 0.
133
+ NUM_HIDDEN_LAYERS: 6
134
+ NUM_GENERATION_LAYERS: 6
135
+
136
+ ####################################### Optimizer #######################################
137
+ SOLVER:
138
+ NAME: 'AdamW'
139
+ # EPOCH: 1
140
+ MAX_ITER: 30000
141
+ CHECKPOINT_PERIOD: 5000
142
+ CHECKPOINT_MAX_SAVE: 5
143
+ EVAL_PERIOD: 1000
144
+ BASE_LR: 0.00005
145
+ BIAS_LR_FACTOR: 1.0
146
+ WEIGHT_DECAY: 0.01
147
+ WEIGHT_DECAY_NORM: 0.0
148
+ WEIGHT_DECAY_BIAS: 0.0
149
+ MOMENTUM: 0.9
150
+ DAMPENING: 0.0
151
+ NESTEROV: 0.0
152
+ BETAS: [0.9, 0.999]
153
+ EPS: 1e-8
154
+ GRAD_CLIP: 5.0
155
+ GRAD_CLIP_TYPE: 'norm'
156
+ ACCUM_ITER: 0
157
+ AMP_FP16: True
158
+ APEX_FP16: False # dangerous
159
+
160
+ CHECKPOINT_MAPPING:
161
+ # -
162
+ # ORIGIN: cc3m_caption
163
+ # DEST: mscoco
164
+ -
165
+ ORIGIN: cc3m_retrieve
166
+ DEST: flickr30k
167
+
168
+ CHECKPOINT_MAP: True
169
+ ####################################### lr scheduler #######################################
170
+ LR_SCHEDULER:
171
+ NAME: 'WarmupCosine'
172
+ WARMUP: 1000
173
+ MIN_LR: 0.00000001
174
+
175
+ # ####################################### losses #######################################
176
+ # LOSSES:
177
+ # NAMES: ['LabelSmoothing']
178
+ # LABELSMOOTHING: 0.1
179
+
180
+ ####################################### decode strategy #######################################
181
+ # DECODE_STRATEGY:
182
+ # NAME: 'BeamSearcher'
183
+ # BEAM_SIZE: 2
184
+
185
+ ####################################### evaluation #######################################
186
+ INFERENCE:
187
+ VOCAB: 'CLIP'
188
+ ITER_BASED: True
189
+ find_unused_parameters: true
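
The CHECKPOINT_MAPPING / CHECKPOINT_MAP entries above pair a task name recorded in the pretraining checkpoint ('cc3m_retrieve') with the fine-tuning task that should reuse those weights ('flickr30k'). A hypothetical illustration of that kind of key remapping before load_state_dict; the helper name and the key layout are assumptions, not the repo's actual loader.

    def remap_checkpoint_keys(state_dict, mapping):
        # mapping: list of {'ORIGIN': ..., 'DEST': ...}, as in the YAML above
        out = {}
        for key, value in state_dict.items():
            for m in mapping:
                if m['ORIGIN'] in key:
                    key = key.replace(m['ORIGIN'], m['DEST'])
                    break
            out[key] = value
        return out

    mapping = [{'ORIGIN': 'cc3m_retrieve', 'DEST': 'flickr30k'}]
    sd = {'heads.cc3m_retrieve.weight': 0, 'encoder.layer0.weight': 1}
    print(remap_checkpoint_keys(sd, mapping))
    # -> {'heads.flickr30k.weight': 0, 'encoder.layer0.weight': 1}
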
configs/BERT_L12_H384_experiments/base_model_bert_l12_h384.yaml ADDED
@@ -0,0 +1,80 @@
1
+
2
+ ######################################### MODEL #########################################
3
+ MODEL:
4
+ VOCAB_SIZE: 49411 # include <BOS>/<EOS>
5
+ META_ARCHITECTURE: 'MultiTaskTransformerEncoder'
6
+ ENCODER: ''
7
+ ENCODER_DIM: 384
8
+ DECODER: ''
9
+ DECODER_DIM: 384
10
+
11
+ PREDICTOR: 'EmbedClsAsRetrievalPredictor'
12
+ FEATURE_GATHER: True
13
+ LEARN_TEMP: True
14
+ PRED_USE_NORM: True
15
+ PRED_TEMPERATURE: 0.07
16
+
17
+ BertParamsInit: True
18
+
19
+ CLS_TOKEN: False
20
+
21
+ QUEUE_LEN: 1024
22
+ MAX_LABEL_LEN: 12
23
+
24
+ OUTPUT_PROJ: True # output projection
25
+
26
+
27
+ # #################################### Token embedding ####################################
28
+ TOKEN_EMBED:
29
+ NAME: 'TokenBaseEmbedding'
30
+ DIM: 384
31
+ ACTIVATION: 'none'
32
+ USE_NORM: True
33
+ DROPOUT: 0.0
34
+ POSITION: 'NNEmbeddingEncoding'
35
+ POSITION_MAX_LEN: 512
36
+ TYPE_VOCAB_SIZE: 2
37
+
38
+ # #################################### Visual embedding ####################################
39
+ VISUAL_EMBED:
40
+ NAME: 'VisualPatchEmbedding'
41
+ IN_DIM: 3
42
+ OUT_DIM: 384
43
+ ACTIVATION: 'none'
44
+ USE_NORM: True
45
+ DROPOUT: 0.0
46
+ PATCH_SIZE: 16
47
+ TYPE_SIZE: 1 # image to encoder
48
+
49
+ # #################################### video embedding ####################################
50
+ VIDEO_EMBED:
51
+ NAME: 'VideoBaseEmbedding'
52
+ IN_DIM: 768
53
+ OUT_DIM: 384
54
+ ACTIVATION: 'none'
55
+ USE_NORM: True
56
+ DROPOUT: 0.0
57
+ TYPE_SIZE: 1 # video to encoder
58
+ POSITION: 'NNEmbeddingEncoding'
59
+ MAX_LENGTH: 1600
60
+ PATCH_SIZE_S: 16
61
+ PATCH_SIZE_T: 1
62
+ DIVIDE_ST_POS: True
63
+ USE_VISUAL_TOKENIZER: True
64
+ USE_VISUAL_POS: True
65
+ MAX_FRAMES: 8
66
+
67
+ ####################################### BERT ############################################
68
+ BERT:
69
+ DROP_PATH_PROB: 0.1
70
+ HIDDEN_SIZE: 384
71
+ HIDDEN_DROPOUT_PROB: 0.
72
+ HIDDEN_ACT: "gelu"
73
+ NUM_ATTENTION_HEADS: 6
74
+ INTERMEDIATE_SIZE: 1536
75
+ INTERMEDIATE_DROP: 0.
76
+ FFN_DROPOUT_PROB: 0.
77
+ ATTENTION_PROBS_DROPOUT_PROB: 0.
78
+ NUM_HIDDEN_LAYERS: 12
79
+ NUM_GENERATION_LAYERS: 0
80
+
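
This file is the _BASE_ that the H384 experiment configs inherit from: a child YAML names its base file, and the child's keys are deep-merged over the base's. Below is a minimal sketch of that mechanism, assuming PyYAML; the repo's own config loader follows the same idea with more validation. For example, in1k_training.yaml below sets _BASE_: "base_model_bert_l12_h384.yaml" and then overrides keys such as BERT.DROP_PATH_PROB while inheriting the rest.

    import os
    import yaml  # assumes PyYAML is available

    def deep_merge(base, override):
        out = dict(base)
        for k, v in override.items():
            if isinstance(v, dict) and isinstance(out.get(k), dict):
                out[k] = deep_merge(out[k], v)   # recurse into nested sections
            else:
                out[k] = v                       # child value wins
        return out

    def load_with_base(path):
        with open(path) as f:
            cfg = yaml.safe_load(f) or {}
        base_name = cfg.pop('_BASE_', None)
        if base_name:
            base = load_with_base(os.path.join(os.path.dirname(path), base_name))
            cfg = deep_merge(base, cfg)
        return cfg
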
configs/BERT_L12_H384_experiments/in1k_training.yaml ADDED
@@ -0,0 +1,189 @@
1
+ _BASE_: "base_model_bert_l12_h384.yaml"
2
+
3
+ SHARED_TARGETS:
4
+
5
+ -
6
+ NAME: 'ImageNet1k'
7
+ SHARED_TARGETS_CFG:
8
+ FILE_PATH: 'open_source_dataset/imagenet_class_name_CLIP_with_endoftext.pkl'
9
+ DISTRIBUTED: False
10
+
11
+ # -
12
+ # NAME: 'Vocab_Word'
13
+ # SHARED_TARGETS_CFG:
14
+ # FILE_PATH: 'open_source_dataset/vocabulary_CLIP_with_endoftext.pkl'
15
+ # DISTRIBUTED: True
16
+
17
+ TASKS:
18
+
19
+ -
20
+ NAME: imagenet
21
+ DATASETS:
22
+ TRAIN: 'ImageNetDataset'
23
+ VAL: 'ImageNetDataset'
24
+ TASK_TYPE: 'image_classification'
25
+ DATASET_NAME: 'ImageNet1k'
26
+ TARGET_SET: ['ImageNet1k']
27
+
28
+ DATALOADER:
29
+ TRAIN_BATCH_SIZE: 32
30
+ TEST_BATCH_SIZE: 32
31
+ NUM_WORKERS: 4 # also used as num_workers for the test loader
32
+ FEATS_FOLDER: 'open_source_dataset/imagenet'
33
+ S3_PATH: 'cluster2:s3://imagenet'
34
+ ANNO_FOLDER: 'open_source_dataset/imagenet/meta'
35
+ SAMPLING_WEIGHT: 1.0
36
+ CLASS_NAME_FILE: 'open_source_dataset/imagenet_class_name.pkl'
37
+ MIXUP: 0.8
38
+ CUTMIX: 1.0
39
+ MIXUP_PROB: 1.0
40
+ MIXUP_SWITCH_PROB: 0.5
41
+ MIXUP_MODE: 'batch'
42
+ MIXUP_LABEL_SMOOTHING: 0.1
43
+ MODEL:
44
+ MAX_SEQ_LEN: -1
45
+ LABELS_NUM: 1000
46
+ TEMP_NAME: logit_scale_img_cls
47
+ LOSSES:
48
+ NAMES: ['SoftTargetCrossEntropy', 'Accuracy']
49
+ LOSS_WEIGHT: 1.0
50
+ REDUCTION: 'mean'
51
+ # LOSS_FP32: True
52
+ INFERENCE:
53
+ NAME: 'ImageNetEvaler'
54
+ ID_KEY: 'image_id'
55
+ VALUE: 'cls_logits'
56
+ VAL_ANNFILE: 'open_source_dataset/imagenet/meta/val.txt'
57
+ TEST_ANNFILE: ''
58
+ GENERATION_MODE: False
59
+
60
+
61
+ ENGINE:
62
+ NAME: 'UnifiedTrainer'
63
+
64
+ MODEL:
65
+ META_ARCHITECTURE: 'MultiTaskTransformerEncoder'
66
+ ENCODER: 'UnifiedBertEncoder'
67
+
68
+ IN_TUNING: True # use IN1k instead of 22k
69
+ SHARE_LAYERNORM: True
70
+ BERT:
71
+ NORMALIZE_DECISION: "BERTPre"
72
+ DROP_PATH_PROB: 0.1
73
+ DROP_PATH_PROB_FIXED: True
74
+
75
+ UNIFY_QKV: True
76
+
77
+ MODEL_EMA: False
78
+ MODEL_EMA_DECAY: 0.9999
79
+
80
+ MAEParamsInit: True
81
+ POSEMBEDFIX: True
82
+
83
+
84
+ IMG_INPUT_SIZE: 224
85
+ PATCH_SIZE: 16
86
+
87
+ LAYER_SCALE: True
88
+ LAYER_SCALE_INIT: 1e-3
89
+
90
+
91
+ DATALOADER:
92
+ USE_WEIGHTED_SAMPLER: True
93
+ UNIFIED_DATASET: True
94
+ NUM_WORKERS: 16
95
+
96
+ PADDING_TO_MAX: False # set True for debugging, or for token MoE with distributed MoE
97
+
98
+
99
+
100
+ ####################################### Optimizer #######################################
101
+ SOLVER:
102
+ NAME: 'Adam'
103
+ TORCH_OPTIMIZER: True
104
+ PARAMS_SEPERATE: True
105
+ # PARAMS_GROUP: True
106
+ # EPOCH: 1
107
+ MAX_ITER: 200000
108
+ CHECKPOINT_PERIOD: 10
109
+ EVAL_PERIOD: 500000
110
+ BASE_LR: 0.001
111
+ BIAS_LR_FACTOR: 1.0
112
+ WEIGHT_DECAY: 0.3
113
+ WEIGHT_DECAY_NORM: 0.0
114
+ WEIGHT_DECAY_BIAS: 0.0
115
+ WEIGHT_DECAY_EMBEDDING: 0.0
116
+ MOMENTUM: 0.9
117
+ DAMPENING: 0.0
118
+ NESTEROV: 0.0
119
+ BETAS: [0.9, 0.95]
120
+ EPS: 1e-6
121
+ GRAD_CLIP: 0.1
122
+ GRAD_CLIP_TYPE: 'norm'
123
+ ACCUM_ITER: 0
124
+ AMP_FP16: True
125
+ APEX_FP16: False # dangerous
126
+
127
+ WRITE_PERIOD: 50
128
+ MIN_LOSS_SCLE: 2048.0
129
+ # BF16: False # True
130
+ # ZEROSTAGE: 2
131
+
132
+ LOSS_SCALE_WINDOW: 200
133
+
134
+
135
+
136
+
137
+
138
+
139
+ ####################################### lr scheduler #######################################
140
+ LR_SCHEDULER:
141
+ NAME: 'WarmupCosine'
142
+ WARMUP: 20000
143
+ MIN_LR: 0.000001
144
+
145
+
146
+
147
+
148
+ ####################################### evaluation #######################################
149
+ INFERENCE:
150
+
151
+ VOCAB: 'CLIP'
152
+ ITER_BASED: True
153
+
154
+
155
+ find_unused_parameters: true
156
+
157
+ # ENCODERS:
158
+ # -
159
+ # NAME: VisualEncoder
160
+ # TYPE: VisualEncoder
161
+ # DROP_PATH_PROB: 0.0
162
+ # HIDDEN_SIZE: 192
163
+ # HIDDEN_DROPOUT_PROB: 0.
164
+ # HIDDEN_ACT: "gelu"
165
+ # NUM_ATTENTION_HEADS: 3
166
+ # INTERMEDIATE_SIZE: 768
167
+ # INTERMEDIATE_DROP: 0.
168
+ # FFN_DROPOUT_PROB: 0.
169
+ # ATTENTION_PROBS_DROPOUT_PROB: 0.
170
+ # NUM_HIDDEN_LAYERS: 6
171
+ # NUM_GENERATION_LAYERS: 0
172
+ # DROP_PATH_PROB_FIXED: True
173
+
174
+ # -
175
+ # NAME: TextEncoder
176
+ # TYPE: TextEncoder
177
+ # DROP_PATH_PROB: 0.0
178
+ # HIDDEN_SIZE: 192
179
+ # HIDDEN_DROPOUT_PROB: 0.
180
+ # HIDDEN_ACT: "gelu"
181
+ # NUM_ATTENTION_HEADS: 3
182
+ # INTERMEDIATE_SIZE: 768
183
+ # INTERMEDIATE_DROP: 0.
184
+ # FFN_DROPOUT_PROB: 0.
185
+ # ATTENTION_PROBS_DROPOUT_PROB: 0.
186
+ # NUM_HIDDEN_LAYERS: 6
187
+ # NUM_GENERATION_LAYERS: 0
188
+ # DROP_PATH_PROB_FIXED: True
189
+
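
The MIXUP* keys in the imagenet task above line up one-to-one with the arguments of timm's Mixup helper, which codebases like this commonly wrap; the sketch below is written under that assumption (timm may not be the exact dependency here). Note that the resulting soft targets are what the SoftTargetCrossEntropy loss expects.

    import torch
    from timm.data import Mixup  # assumption: timm-style mixup/cutmix

    mixup_fn = Mixup(
        mixup_alpha=0.8,       # MIXUP
        cutmix_alpha=1.0,      # CUTMIX
        prob=1.0,              # MIXUP_PROB
        switch_prob=0.5,       # MIXUP_SWITCH_PROB
        mode='batch',          # MIXUP_MODE
        label_smoothing=0.1,   # MIXUP_LABEL_SMOOTHING
        num_classes=1000,      # LABELS_NUM
    )
    images = torch.randn(32, 3, 224, 224)          # IMG_INPUT_SIZE: 224
    targets = torch.randint(0, 1000, (32,))
    images, soft_targets = mixup_fn(images, targets)
    print(soft_targets.shape)                      # (32, 1000) soft labels
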
configs/BERT_L12_H768_experiments/16tasks_training.yaml ADDED
@@ -0,0 +1,738 @@
1
+ _BASE_: "base_model_bert_l12_h768.yaml"
2
+
3
+ SHARED_TARGETS:
4
+
5
+ -
6
+ NAME: 'ImageNet22k'
7
+ SHARED_TARGETS_CFG:
8
+ FILE_PATH: 'open_source_dataset/imagenet_22k_class_name_CLIP_with_endoftext.pkl'
9
+ DISTRIBUTED: True
10
+
11
+ -
12
+ NAME: 'Vocab_Word'
13
+ SHARED_TARGETS_CFG:
14
+ FILE_PATH: 'open_source_dataset/vocabulary_CLIP_with_endoftext.pkl'
15
+ DISTRIBUTED: True
16
+
17
+ -
18
+ NAME: 'MomentsInTime'
19
+ SHARED_TARGETS_CFG:
20
+ FILE_PATH: 'open_source_dataset/MiT_class_name_CLIP_with_endoftext.pkl'
21
+ DISTRIBUTED: False
22
+
23
+ -
24
+ NAME: 'Kinetics700'
25
+ SHARED_TARGETS_CFG:
26
+ FILE_PATH: 'open_source_dataset/k700_class_name_CLIP_with_endoftext.pkl'
27
+ DISTRIBUTED: False
28
+
29
+ TASKS:
30
+
31
+ -
32
+ NAME: imagenet22k
33
+ DATASETS:
34
+ TRAIN: 'ImageNet22KDataset'
35
+ TASK_TYPE: 'image_classification'
36
+ DATASET_NAME: 'ImageNet22k'
37
+ TARGET_SET: ['ImageNet22k']
38
+
39
+ DATALOADER:
40
+ TRAIN_BATCH_SIZE: 720
41
+ # TEST_BATCH_SIZE: 2
42
+ NUM_WORKERS: 2
43
+ FEATS_FOLDER: 'open_source_dataset/imagenet22k'
44
+ S3_PATH: 'cluster2:s3://imagenet22k'
45
+ ANNO_FOLDER: 'open_source_dataset/'
46
+ SAMPLING_WEIGHT: 2.486
47
+ MIXUP: 0.8
48
+ CUTMIX: 1.0
49
+ MIXUP_PROB: 1.0
50
+ MIXUP_SWITCH_PROB: 0.5
51
+ MIXUP_MODE: 'batch'
52
+ MIXUP_LABEL_SMOOTHING: 0.1
53
+ MODEL:
54
+ MAX_SEQ_LEN: -1
55
+ LABELS_NUM: 21842
56
+ TEMP_NAME: logit_scale_img_cls
57
+ LOSSES:
58
+ NAMES: ['SoftTargetCrossEntropy', 'Accuracy']
59
+ LOSS_WEIGHT: 1.0
60
+ REDUCTION: 'mean'
61
+
62
+ -
63
+ NAME: K700_retrieve
64
+ DATASETS:
65
+ TRAIN: 'VideoDataSet'
66
+ TASK_TYPE: 'video_classification'
67
+ DATASET_NAME: 'K700'
68
+ TARGET_SET: ['Kinetics700']
69
+ DATALOADER:
70
+ TRAIN_BATCH_SIZE: 64
71
+ TEST_BATCH_SIZE: 24
72
+ NUM_WORKERS: 2
73
+ FEATS_FOLDER: 'open_source_dataset/K700'
74
+ ANNO_FOLDER: 'open_source_dataset/K700'
75
+ S3_PATH: 's3://K700/'
76
+ FRAMES_PER_CLIP: 4
77
+ STRIDE: 32
78
+ FILE_EXTENSION: ''
79
+ ANNO_FILE: 'annotation.json'
80
+ TIMESFORMER_AUG: True
81
+ SAMPLING_WEIGHT: 0.76
82
+
83
+ MODEL:
84
+ MAX_SEQ_LEN: -1
85
+ TEMP_NAME: logit_scale_video_cls
86
+ LOSSES:
87
+ NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
88
+ LABELSMOOTHING: 0.1
89
+ LOSS_WEIGHT: 0.1
90
+ INFERENCE:
91
+ VOCAB: 'CLIP'
92
+ GENERATION_MODE: False
93
+
94
+ -
95
+ NAME: MomentsInTime
96
+ DATASETS:
97
+ TRAIN: 'VideoDataSet'
98
+ TASK_TYPE: 'video_classification'
99
+ DATASET_NAME: 'MiT'
100
+ TARGET_SET: ['MomentsInTime']
101
+ DATALOADER:
102
+ TRAIN_BATCH_SIZE: 112
103
+ TEST_BATCH_SIZE: 8
104
+ NUM_WORKERS: 2
105
+ FEATS_FOLDER: 'open_source_dataset/MomentsInTime'
106
+ ANNO_FOLDER: 'open_source_dataset/MomentsInTime'
107
+ S3_PATH: 's3://MomentsInTime/'
108
+ FRAMES_PER_CLIP: 3
109
+ STRIDE: 32
110
+ FILE_EXTENSION: ''
111
+ ANNO_FILE: 'annotation.json'
112
+ TIMESFORMER_AUG: True
113
+ SAMPLING_WEIGHT: 0.44
114
+
115
+ MODEL:
116
+ MAX_SEQ_LEN: -1
117
+ TEMP_NAME: logit_scale_video_cls
118
+ LOSSES:
119
+ NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
120
+ LABELSMOOTHING: 0.1
121
+ LOSS_WEIGHT: 0.1
122
+ INFERENCE:
123
+ NAME: 'MiTEvaler'
124
+ ID_KEY: 'video_name'
125
+ VALUE: 'label'
126
+ VAL_ANNFILE: 'open_source_dataset/MomentsInTime/annotation.json'
127
+ TEST_ANNFILE: ''
128
+ GENERATION_MODE: False
129
+ NUM_VIEWS: 1
130
+
131
+ -
132
+ NAME: bookswiki_pretrain
133
+ DATASETS:
134
+ TRAIN: 'GeneralCorpusDataset'
135
+ TASK_TYPE: 'text_mlm'
136
+ DATASET_NAME: 'BooksWiki'
137
+ TARGET_SET: ['Vocab_Word']
138
+ VERSION: 'v2'
139
+ DATALOADER:
140
+ TRAIN_BATCH_SIZE: 512
141
+ TEST_BATCH_SIZE: 32
142
+ NUM_WORKERS: 2
143
+ ANNO_FOLDER: 'open_source_dataset/text_corpus' # 'open_source_dataset/bert_pretrain_data/bookswiki'
144
+ # ANNO_FOLDER: 'open_source_dataset/bert_pretrain_data/bookswiki'
145
+ SEQ_PER_SAMPLE: 1
146
+ SAMPLER: NodeDistributed
147
+ CACHE_MODE: True
148
+ SEQ_PER_SAMPLE: 128
149
+ MIN_SEQ_PER_SAMPLE: 128
150
+ APPEND_EOS: True
151
+ ONE_STREAM: False
152
+ SAMPLING_WEIGHT: 2.75
153
+ RANDOM_MASK: True
154
+ MODEL:
155
+ MAX_SEQ_LEN: 128
156
+ TEMP_NAME: logit_scale_text_mlm
157
+ LOSSES:
158
+ NAMES: ['CrossEntropy', 'Accuracy']
159
+ LOSS_WEIGHT: 0.5
160
+ REDUCTION: 'mean'
161
+ INFERENCE:
162
+ VOCAB: 'CLIP'
163
+ GENERATION_MODE: False
164
+
165
+
166
+ -
167
+ NAME: yfcc_caption
168
+ DATASETS:
169
+ TRAIN: 'ImageTextPairDataset'
170
+ TASK_TYPE: 'image_caption'
171
+ DATASET_NAME: 'YFCC'
172
+ TARGET_SET: ['Vocab_Word']
173
+ DATALOADER:
174
+ TRAIN_BATCH_SIZE: 300
175
+ TEST_BATCH_SIZE: 32
176
+ NUM_WORKERS: 2
177
+ S3_ANNO_FOLDER: 'cluster2:s3://yfcc'
178
+ ANNO_FOLDER: 'open_source_dataset/yfcc'
179
+ ANNO_FILENAME: 'yfcc100m_subset_available_untokenized.json'
180
+ FEATS_FOLDER: 'open_source_dataset/yfcc/'
181
+ S3_PATH: 'cluster2:s3://yfcc/'
182
+ SEQ_PER_SAMPLE: 1
183
+ SAMPLER: NodeDistributed
184
+ CACHE_MODE: True
185
+ CIRCULAR_CACHE_MODE: False
186
+ ZIP_MODE: False
187
+ CACHE_ORIGIN_IMAGE: False
188
+ RANDOM_CAPTION: True
189
+ AS_NUMPY_AS_POSSIBLE: False
190
+ SAMPLING_WEIGHT: 0.5840
191
+ TRANSFORM: 'clip_transforms'
192
+ MODEL:
193
+ MAX_SEQ_LEN: 50
194
+ TEMP_NAME: logit_scale_caption
195
+ LOSSES:
196
+ NAMES: ['CrossEntropy', 'Accuracy']
197
+ LOSS_WEIGHT: 1.0
198
+ REDUCTION: 'mean'
199
+ INFERENCE:
200
+ VOCAB: 'CLIP'
201
+ GENERATION_MODE: False
202
+
203
+ -
204
+ NAME: cc12m_caption
205
+ DATASETS:
206
+ TRAIN: 'ImageTextPairDataset'
207
+ TASK_TYPE: 'image_caption'
208
+ DATASET_NAME: 'CC12M'
209
+ TARGET_SET: ['Vocab_Word']
210
+ DATALOADER:
211
+ TRAIN_BATCH_SIZE: 300
212
+ TEST_BATCH_SIZE: 32
213
+ NUM_WORKERS: 2
214
+ S3_ANNO_FOLDER: 's3://cc12m/'
215
+ ANNO_FOLDER: 'open_source_dataset/c12m/'
216
+ ANNO_FILENAME: 'train_available.json'
217
+ FEATS_FOLDER: 'open_source_dataset/c12m/'
218
+ S3_PATH: 's3://cc12m/'
219
+ SEQ_PER_SAMPLE: 1
220
+ SAMPLER: NodeDistributed
221
+ CACHE_MODE: True
222
+ CIRCULAR_CACHE_MODE: False
223
+ ZIP_MODE: False
224
+ CACHE_ORIGIN_IMAGE: False
225
+ RANDOM_CAPTION: False
226
+ AS_NUMPY_AS_POSSIBLE: False
227
+ SAMPLING_WEIGHT: 0.5057
228
+ TRANSFORM: 'clip_transforms'
229
+ MODEL:
230
+ MAX_SEQ_LEN: 50
231
+ TEMP_NAME: logit_scale_caption
232
+ LOSSES:
233
+ NAMES: ['CrossEntropy', 'Accuracy']
234
+ LOSS_WEIGHT: 1.0
235
+ REDUCTION: 'mean'
236
+ INFERENCE:
237
+ VOCAB: 'CLIP'
238
+ GENERATION_MODE: False
239
+
240
+ -
241
+ NAME: cc3m_caption
242
+ DATASETS:
243
+ TRAIN: 'ImageTextPairDataset'
244
+ TASK_TYPE: 'image_caption'
245
+ DATASET_NAME: 'CC3M'
246
+ TARGET_SET: ['Vocab_Word']
247
+ DATALOADER:
248
+ TRAIN_BATCH_SIZE: 300
249
+ TEST_BATCH_SIZE: 32
250
+ NUM_WORKERS: 2
251
+ S3_ANNO_FOLDER: 's3://cc3m/'
252
+ ANNO_FOLDER: 'open_source_dataset/cc3m/'
253
+ ANNO_FILENAME: 'train_spacy.json'
254
+ FEATS_FOLDER: 'open_source_dataset/cc3m/'
255
+ S3_PATH: 's3://cc3m/'
256
+ SEQ_PER_SAMPLE: 1
257
+ SAMPLER: NodeDistributed
258
+ CACHE_MODE: True
259
+ CIRCULAR_CACHE_MODE: False
260
+ ZIP_MODE: False
261
+ CACHE_ORIGIN_IMAGE: False
262
+ RANDOM_CAPTION: False
263
+ AS_NUMPY_AS_POSSIBLE: False
264
+ SAMPLING_WEIGHT: 0.26295
265
+ TRANSFORM: 'clip_transforms'
266
+ MODEL:
267
+ MAX_SEQ_LEN: 50
268
+ TEMP_NAME: logit_scale_caption
269
+ LOSSES:
270
+ NAMES: ['CrossEntropy', 'Accuracy']
271
+ LOSS_WEIGHT: 1.0
272
+ REDUCTION: 'mean'
273
+ INFERENCE:
274
+ VOCAB: 'CLIP'
275
+ GENERATION_MODE: False
276
+
277
+ -
278
+ NAME: vg_caption
279
+ DATASETS:
280
+ TRAIN: 'ImageTextPairDataset'
281
+ TASK_TYPE: 'image_caption'
282
+ DATASET_NAME: 'VG'
283
+ TARGET_SET: ['Vocab_Word']
284
+ DATALOADER:
285
+ TRAIN_BATCH_SIZE: 300
286
+ TEST_BATCH_SIZE: 32
287
+ NUM_WORKERS: 2
288
+ FEATS_FOLDER: 'open_source_dataset/visual_genome/images'
289
+ ANNO_FOLDER: 'open_source_dataset/visual_genome/annotations'
290
+ S3_PATH: 's3://visual_genome/images'
291
+ ANNO_FILENAME: 'vg_captions_128filter.json'
292
+ SEQ_PER_SAMPLE: 1
293
+ CACHE_MODE: True
294
+ CIRCULAR_CACHE_MODE: False
295
+ ZIP_MODE: False
296
+ CACHE_ORIGIN_IMAGE: False
297
+ RANDOM_CAPTION: False
298
+ AS_NUMPY_AS_POSSIBLE: False
299
+ SAMPLING_WEIGHT: 0.1766
300
+ TRANSFORM: 'clip_transforms'
301
+ MODEL:
302
+ MAX_SEQ_LEN: 30
303
+ TEMP_NAME: logit_scale_caption
304
+ LOSSES:
305
+ NAMES: ['CrossEntropy', 'Accuracy']
306
+ LOSS_WEIGHT: 1.0
307
+ REDUCTION: 'mean'
308
+ INFERENCE:
309
+ VOCAB: 'CLIP'
310
+ GENERATION_MODE: True
311
+
312
+
313
+ -
314
+ NAME: mscoco_caption
315
+ DATASETS:
316
+ TRAIN: 'ImageTextPairDataset'
317
+ # VAL: 'ImageTextPairDataset'
318
+ # TEST: 'ImageTextPairDataset'
319
+ TASK_TYPE: 'image_caption'
320
+ DATASET_NAME: 'MSCOCO'
321
+ TARGET_SET: ['Vocab_Word']
322
+ DATALOADER:
323
+ TRAIN_BATCH_SIZE: 300
324
+ TEST_BATCH_SIZE: 32
325
+ NUM_WORKERS: 1
326
+ FEATS_FOLDER: 'open_source_dataset/mscoco_dataset/coco_origin'
327
+ ANNO_FOLDER: 'open_source_dataset/mscoco_dataset/new_annotations'
328
+ S3_PATH: 's3://coco/'
329
+ SEQ_PER_SAMPLE: 1
330
+ CACHE_MODE: True
331
+ CIRCULAR_CACHE_MODE: False
332
+ ZIP_MODE: False
333
+ CACHE_ORIGIN_IMAGE: False
334
+ RANDOM_CAPTION: False
335
+ AS_NUMPY_AS_POSSIBLE: False
336
+ SAMPLING_WEIGHT: 0.1144
337
+ TRANSFORM: 'clip_transforms'
338
+ RANDOM_MASK: True
339
+ MODEL:
340
+ MAX_SEQ_LEN: 50
341
+ EVAL_MAX_SEQ_LEN: 21
342
+ TEMP_NAME: logit_scale_caption
343
+ LOSSES:
344
+ NAMES: ['CrossEntropy', 'Accuracy']
345
+ LOSS_WEIGHT: 1.0
346
+ REDUCTION: 'mean'
347
+ DECODE_STRATEGY:
348
+ NAME: 'CaptionBeamSearcherV3'
349
+ BEAM_SIZE: 2
350
+ # LEN_PENALTY: 1.0
351
+ INFERENCE:
352
+ NAME: 'COCOEvaler'
353
+ VOCAB: 'CLIP'
354
+ ID_KEY: 'image_id'
355
+ VALUE: 'caption'
356
+ VAL_ANNFILE: 'open_source_dataset/mscoco_dataset/new_annotations/captions_val5k.json'
357
+ TEST_ANNFILE: 'open_source_dataset/mscoco_dataset/new_annotations/captions_test5k.json'
358
+ GENERATION_MODE: True
359
+
360
+ -
361
+ NAME: sbu_caption
362
+ DATASETS:
363
+ TRAIN: 'ImageTextPairDataset'
364
+ TASK_TYPE: 'image_caption'
365
+ DATASET_NAME: 'SBU'
366
+ TARGET_SET: ['Vocab_Word']
367
+ DATALOADER:
368
+ TRAIN_BATCH_SIZE: 300
369
+ TEST_BATCH_SIZE: 32
370
+ NUM_WORKERS: 1
371
+ S3_ANNO_FOLDER: 's3://SBU/annotations'
372
+ ANNO_FOLDER: 'open_source_dataset/sbucaption/annotations'
373
+ ANNO_FILENAME: 'subcaption.json'
374
+ FEATS_FOLDER: 'open_source_dataset/sbucaption/'
375
+ S3_PATH: 's3://SBU/images'
376
+ SEQ_PER_SAMPLE: 1
377
+ SAMPLER: NodeDistributed
378
+ CACHE_MODE: True
379
+ CIRCULAR_CACHE_MODE: False
380
+ ZIP_MODE: False
381
+ CACHE_ORIGIN_IMAGE: False
382
+ RANDOM_CAPTION: False
383
+ AS_NUMPY_AS_POSSIBLE: False
384
+ SAMPLING_WEIGHT: 0.1383
385
+ TRANSFORM: 'clip_transforms'
386
+ MODEL:
387
+ MAX_SEQ_LEN: 50
388
+ TEMP_NAME: logit_scale_caption
389
+ LOSSES:
390
+ NAMES: ['CrossEntropy', 'Accuracy']
391
+ LOSS_WEIGHT: 1.0
392
+ REDUCTION: 'mean'
393
+ INFERENCE:
394
+ VOCAB: 'CLIP'
395
+ GENERATION_MODE: False
396
+
397
+ -
398
+ NAME: yfcc_retrieve
399
+ DATASETS:
400
+ TRAIN: 'ImageTextPairDataset'
401
+ TASK_TYPE: 'image_retrieval'
402
+ DATASET_NAME: 'YFCC'
403
+ DATALOADER:
404
+ TRAIN_BATCH_SIZE: 512
405
+ TEST_BATCH_SIZE: 32
406
+ NUM_WORKERS: 2
407
+ S3_ANNO_FOLDER: 'cluster2:s3://yfcc'
408
+ ANNO_FOLDER: 'open_source_dataset/yfcc'
409
+ ANNO_FILENAME: 'yfcc100m_subset_available_untokenized.json'
410
+ FEATS_FOLDER: 'open_source_dataset/yfcc/'
411
+ S3_PATH: 'cluster2:s3://yfcc/'
412
+ SAMPLER: NodeDistributed
413
+ CACHE_MODE: True
414
+ CIRCULAR_CACHE_MODE: False
415
+ ZIP_MODE: False
416
+ CACHE_ORIGIN_IMAGE: False
417
+ RANDOM_CAPTION: True
418
+ AS_NUMPY_AS_POSSIBLE: False
419
+ SAMPLING_WEIGHT: 0.5840
420
+ TRANSFORM: 'clip_transforms'
421
+ MODEL:
422
+ MAX_SEQ_LEN: 50
423
+ TEMP_NAME: logit_scale_retrieve
424
+ LOSSES:
425
+ NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
426
+ LABELSMOOTHING: 0.1
427
+ LOSS_WEIGHT: 0.5
428
+ REDUCTION: 'mean'
429
+ INFERENCE:
430
+ VOCAB: 'CLIP'
431
+ GENERATION_MODE: False
432
+
433
+ -
434
+ NAME: cc12m_retrieve
435
+ DATASETS:
436
+ TRAIN: 'ImageTextPairDataset'
437
+ TASK_TYPE: 'image_retrieval'
438
+ DATASET_NAME: 'CC12M'
439
+ DATALOADER:
440
+ TRAIN_BATCH_SIZE: 512
441
+ TEST_BATCH_SIZE: 32
442
+ NUM_WORKERS: 2
443
+ S3_ANNO_FOLDER: 's3://cc12m/'
444
+ ANNO_FOLDER: 'open_source_dataset/c12m/'
445
+ ANNO_FILENAME: 'train_available.json'
446
+ FEATS_FOLDER: 'open_source_dataset/c12m/'
447
+ S3_PATH: 's3://cc12m/'
448
+ SAMPLER: NodeDistributed
449
+ CACHE_MODE: True
450
+ CIRCULAR_CACHE_MODE: False
451
+ ZIP_MODE: False
452
+ CACHE_ORIGIN_IMAGE: False
453
+ RANDOM_CAPTION: False
454
+ AS_NUMPY_AS_POSSIBLE: False
455
+ SAMPLING_WEIGHT: 0.5057
456
+ TRANSFORM: 'clip_transforms'
457
+ MODEL:
458
+ MAX_SEQ_LEN: 50
459
+ TEMP_NAME: logit_scale_retrieve
460
+ LOSSES:
461
+ NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
462
+ LABELSMOOTHING: 0.1
463
+ LOSS_WEIGHT: 0.5
464
+ REDUCTION: 'mean'
465
+ INFERENCE:
466
+ VOCAB: 'CLIP'
467
+ GENERATION_MODE: False
468
+
469
+ -
470
+ NAME: cc3m_retrieve
471
+ DATASETS:
472
+ TRAIN: 'ImageTextPairDataset'
473
+ TASK_TYPE: 'image_retrieval'
474
+ DATASET_NAME: 'CC3M'
475
+ DATALOADER:
476
+ TRAIN_BATCH_SIZE: 512
477
+ TEST_BATCH_SIZE: 32
478
+ NUM_WORKERS: 2
479
+ S3_ANNO_FOLDER: 's3://cc3m/'
480
+ ANNO_FOLDER: 'open_source_dataset/cc3m/'
481
+ ANNO_FILENAME: 'train_spacy.json'
482
+ FEATS_FOLDER: 'open_source_dataset/cc3m/'
483
+ S3_PATH: 's3://cc3m/'
484
+ SAMPLER: NodeDistributed
485
+ CACHE_MODE: True
486
+ CIRCULAR_CACHE_MODE: False
487
+ ZIP_MODE: False
488
+ CACHE_ORIGIN_IMAGE: False
489
+ RANDOM_CAPTION: False
490
+ AS_NUMPY_AS_POSSIBLE: False
491
+ SAMPLING_WEIGHT: 0.26295
492
+ TRANSFORM: 'clip_transforms'
493
+ MODEL:
494
+ MAX_SEQ_LEN: 50
495
+ TEMP_NAME: logit_scale_retrieve
496
+ LOSSES:
497
+ NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
498
+ LABELSMOOTHING: 0.1
499
+ LOSS_WEIGHT: 0.5
500
+ REDUCTION: 'mean'
501
+ INFERENCE:
502
+ VOCAB: 'CLIP'
503
+ GENERATION_MODE: False
504
+
505
+ -
506
+ NAME: vg_retrieve
507
+ DATASETS:
508
+ TRAIN: 'ImageTextPairDataset'
509
+ TASK_TYPE: 'image_retrieval'
510
+ DATASET_NAME: 'VG'
511
+ DATALOADER:
512
+ TRAIN_BATCH_SIZE: 512
513
+ TEST_BATCH_SIZE: 32
514
+ NUM_WORKERS: 2
515
+ FEATS_FOLDER: 'open_source_dataset/visual_genome/images'
516
+ ANNO_FOLDER: 'open_source_dataset/visual_genome/annotations'
517
+ S3_PATH: 's3://visual_genome/images'
518
+ ANNO_FILENAME: 'vg_captions_128filter.json'
519
+ SEQ_PER_SAMPLE: 1
520
+ CACHE_MODE: True
521
+ CIRCULAR_CACHE_MODE: False
522
+ ZIP_MODE: False
523
+ CACHE_ORIGIN_IMAGE: False
524
+ RANDOM_CAPTION: False
525
+ AS_NUMPY_AS_POSSIBLE: False
526
+ SAMPLING_WEIGHT: 0.1766
527
+ TRANSFORM: 'clip_transforms'
528
+ MODEL:
529
+ MAX_SEQ_LEN: 30
530
+ TEMP_NAME: logit_scale_retrieve
531
+ LOSSES:
532
+ NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
533
+ LABELSMOOTHING: 0.1
534
+ LOSS_WEIGHT: 0.5
535
+ REDUCTION: 'mean'
536
+ INFERENCE:
537
+ VOCAB: 'CLIP'
538
+ GENERATION_MODE: False
539
+
540
+ -
541
+ NAME: mscoco_retrieve
542
+ DATASETS:
543
+ TRAIN: 'ImageTextPairDataset'
544
+ # TEST: 'ImageTextPairDataset'
545
+ TASK_TYPE: 'image_retrieval'
546
+ DATASET_NAME: 'MSCOCO'
547
+ DATALOADER:
548
+ TRAIN_BATCH_SIZE: 512
549
+ TEST_BATCH_SIZE: 32
550
+ NUM_WORKERS: 1
551
+ FEATS_FOLDER: 'open_source_dataset/mscoco_dataset/coco_origin'
552
+ ANNO_FOLDER: 'open_source_dataset/mscoco_dataset/new_annotations'
553
+ S3_PATH: 's3://coco/'
554
+ SEQ_PER_SAMPLE: 1
555
+ CACHE_MODE: True
556
+ CIRCULAR_CACHE_MODE: False
557
+ ZIP_MODE: False
558
+ CACHE_ORIGIN_IMAGE: False
559
+ RANDOM_CAPTION: False
560
+ AS_NUMPY_AS_POSSIBLE: False
561
+ SAMPLING_WEIGHT: 0.1144
562
+ TRANSFORM: 'clip_transforms'
563
+ MODEL:
564
+ MAX_SEQ_LEN: 50
565
+ TEMP_NAME: logit_scale_retrieve
566
+ LOSSES:
567
+ NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
568
+ LABELSMOOTHING: 0.1
569
+ LOSS_WEIGHT: 0.5
570
+ REDUCTION: 'mean'
571
+ INFERENCE:
572
+ VOCAB: 'CLIP'
573
+ ID_KEY: 'image_id'
574
+ VALUE: 'caption'
575
+ NAME: 'RetrievalEvaler'
576
+ VAL_ANNFILE: 'open_source_dataset/flickr30k/all_data_final_val_set0_2014.jsonline'
577
+ TEST_ANNFILE: 'open_source_dataset/flickr30k/all_data_final_test_set0_2014.jsonline'
578
+ GENERATION_MODE: False
579
+
580
+ -
581
+ NAME: sbu_retrieve
582
+ DATASETS:
583
+ TRAIN: 'ImageTextPairDataset'
584
+ TASK_TYPE: 'image_retrieval'
585
+ DATASET_NAME: 'SBU'
586
+ DATALOADER:
587
+ TRAIN_BATCH_SIZE: 512
588
+ TEST_BATCH_SIZE: 32
589
+ NUM_WORKERS: 1
590
+ S3_ANNO_FOLDER: 's3://SBU/annotations'
591
+ ANNO_FOLDER: 'open_source_dataset/sbucaption/annotations'
592
+ ANNO_FILENAME: 'subcaption.json'
593
+ FEATS_FOLDER: 'open_source_dataset/sbucaption/'
594
+ S3_PATH: 's3://SBU/images'
595
+ SAMPLER: NodeDistributed
596
+ CACHE_MODE: True
597
+ CIRCULAR_CACHE_MODE: False
598
+ ZIP_MODE: False
599
+ CACHE_ORIGIN_IMAGE: False
600
+ RANDOM_CAPTION: False
601
+ AS_NUMPY_AS_POSSIBLE: False
602
+ SAMPLING_WEIGHT: 0.1383
603
+ TRANSFORM: 'clip_transforms'
604
+ MODEL:
605
+ MAX_SEQ_LEN: 50
606
+ TEMP_NAME: logit_scale_retrieve
607
+ LOSSES:
608
+ NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
609
+ LABELSMOOTHING: 0.1
610
+ LOSS_WEIGHT: 0.5
611
+ REDUCTION: 'mean'
612
+ INFERENCE:
613
+ VOCAB: 'CLIP'
614
+ GENERATION_MODE: False
615
+
616
+
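Note: every retrieval task above pairs LabelSmoothingCrossEntropy with LABELSMOOTHING: 0.1. A minimal sketch of that loss in the standard formulation (the repo's own LabelSmoothingCrossEntropy may differ in details):

import torch
import torch.nn.functional as F

def label_smoothing_ce(logits, target, eps=0.1):
    # Spread eps of the target probability mass uniformly over all classes.
    logp = F.log_softmax(logits, dim=-1)
    nll = -logp.gather(-1, target.unsqueeze(-1)).squeeze(-1)
    smooth = -logp.mean(dim=-1)
    return ((1.0 - eps) * nll + eps * smooth).mean()

# e.g. loss = label_smoothing_ce(torch.randn(4, 700), torch.randint(0, 700, (4,)))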
617
+ ENGINE:
618
+ NAME: 'UnifiedTrainer'
619
+
620
+ MODEL:
621
+ META_ARCHITECTURE: 'MultiTaskTransformerEncoder'
622
+ ENCODER: 'UnifiedBertEncoder'
623
+
624
+
625
+ SHARE_LAYERNORM: True
626
+ BERT:
627
+ NORMALIZE_DECISION: "BERTPre"
628
+ DROP_PATH_PROB: 0.1
629
+ DROP_PATH_PROB_FIXED: True
630
+
631
+ UNIFY_QKV: True
632
+
633
+ MODEL_EMA: False
634
+ MODEL_EMA_DECAY: 0.9999
635
+
636
+ MAEParamsInit: True
637
+ POSEMBEDFIX: True
638
+
639
+
640
+ IMG_INPUT_SIZE: 160
641
+ PATCH_SIZE: 16
642
+
643
+ LAYER_SCALE: True
644
+ LAYER_SCALE_INIT: 1e-3
645
+
646
+
647
+ DATALOADER:
648
+ USE_WEIGHTED_SAMPLER: True
649
+ UNIFIED_DATASET: True
650
+ NUM_WORKERS: 32
651
+
652
+ PADDING_TO_MAX: False # set True only for debugging, or for token MoE with distributed MoE
653
+
654
+
655
+
656
+ ####################################### Optimizer #######################################
657
+ SOLVER:
658
+ NAME: 'Adam'
659
+ TORCH_OPTIMIZER: True
660
+ PARAMS_SEPERATE: True
661
+ # PARAMS_GROUP: True
662
+ # EPOCH: 1
663
+ MAX_ITER: 400000
664
+ CHECKPOINT_PERIOD: 5000
665
+ EVAL_PERIOD: 10000000 # effectively never: periodic evaluation during training is disabled
666
+ BASE_LR: 0.001
667
+ BIAS_LR_FACTOR: 1.0
668
+ WEIGHT_DECAY: 0.2
669
+ WEIGHT_DECAY_NORM: 0.0
670
+ WEIGHT_DECAY_BIAS: 0.0
671
+ WEIGHT_DECAY_EMBEDDING: 0.0
672
+ MOMENTUM: 0.9
673
+ DAMPENING: 0.0
674
+ NESTEROV: 0.0
675
+ BETAS: [0.9, 0.95]
676
+ EPS: 1e-6
677
+ GRAD_CLIP: 0.1
678
+ GRAD_CLIP_TYPE: 'norm'
679
+ ACCUM_ITER: 0
680
+ AMP_FP16: True
681
+ APEX_FP16: False # dangerous: apex O2 can be unstable, so native AMP is used instead
682
+
683
+ WRITE_PERIOD: 50
684
+ MIN_LOSS_SCLE: 2048.0
685
+ # BF16: False # True
686
+ # ZEROSTAGE: 2
687
+
688
+ LOSS_SCALE_WINDOW: 200
689
+
690
+
691
+ ####################################### lr scheduler #######################################
692
+ LR_SCHEDULER:
693
+ NAME: 'WarmupCosine'
694
+ WARMUP: 10000
695
+ MIN_LR: 0.000001
696
+
697
+ ####################################### evaluation #######################################
698
+ INFERENCE:
699
+
700
+ VOCAB: 'CLIP'
701
+ ITER_BASED: True
702
+
703
+
704
+ find_unused_parameters: true
705
+
706
+ # ENCODERS:
707
+ # -
708
+ # NAME: VisualEncoder
709
+ # TYPE: VisualEncoder
710
+ # DROP_PATH_PROB: 0.0
711
+ # HIDDEN_SIZE: 192
712
+ # HIDDEN_DROPOUT_PROB: 0.
713
+ # HIDDEN_ACT: "gelu"
714
+ # NUM_ATTENTION_HEADS: 3
715
+ # INTERMEDIATE_SIZE: 768
716
+ # INTERMEDIATE_DROP: 0.
717
+ # FFN_DROPOUT_PROB: 0.
718
+ # ATTENTION_PROBS_DROPOUT_PROB: 0.
719
+ # NUM_HIDDEN_LAYERS: 6
720
+ # NUM_GENERATION_LAYERS: 0
721
+ # DROP_PATH_PROB_FIXED: True
722
+
723
+ # -
724
+ # NAME: TextEncoder
725
+ # TYPE: TextEncoder
726
+ # DROP_PATH_PROB: 0.0
727
+ # HIDDEN_SIZE: 192
728
+ # HIDDEN_DROPOUT_PROB: 0.
729
+ # HIDDEN_ACT: "gelu"
730
+ # NUM_ATTENTION_HEADS: 3
731
+ # INTERMEDIATE_SIZE: 768
732
+ # INTERMEDIATE_DROP: 0.
733
+ # FFN_DROPOUT_PROB: 0.
734
+ # ATTENTION_PROBS_DROPOUT_PROB: 0.
735
+ # NUM_HIDDEN_LAYERS: 6
736
+ # NUM_GENERATION_LAYERS: 0
737
+ # DROP_PATH_PROB_FIXED: True
738
+
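Note: each task above declares a SAMPLING_WEIGHT, and the shared DATALOADER block enables a weighted sampler over a unified dataset, so batches are drawn from tasks roughly in proportion to those weights. A minimal sketch of that behavior (hypothetical helper, not the repo's actual sampler):

import random

def make_task_sampler(task_weights):
    # task_weights: task name -> SAMPLING_WEIGHT as listed in this config.
    names = list(task_weights)
    weights = [task_weights[n] for n in names]
    return lambda: random.choices(names, weights=weights, k=1)[0]

sample_task = make_task_sampler({
    "K700_retrieve": 0.76, "MomentsInTime": 0.44, "bookswiki_pretrain": 2.75,
    "yfcc_caption": 0.5840, "cc12m_caption": 0.5057, "cc3m_caption": 0.26295,
})
# each step: task = sample_task(); batch = next(loader_iters[task])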
configs/BERT_L12_H768_experiments/16tasks_training_apex_o2.yaml ADDED
@@ -0,0 +1,11 @@
1
+ _BASE_: "16tasks_training.yaml"
2
+
3
+ ####################################### Optimizer #######################################
4
+ SOLVER:
5
+
6
+ AMP_FP16: False
7
+ APEX_FP16: True # dangerous: apex O2 can be unstable; use with care
8
+ APEX_OPT_LEVEL: 'O2'
9
+ MIN_LOSS_SCLE: 128.0
10
+ CHECKPOINT_PERIOD: 10000
11
+
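Note: this override trades native AMP (AMP_FP16) for apex O2 and drops the minimum loss scale from 2048 to 128. For contrast, a minimal sketch of the native-AMP step the base config implies, assuming a standard torch.cuda.amp loop rather than the repo's UnifiedTrainer:

import torch
import torch.nn.functional as F

model = torch.nn.Linear(8, 2).cuda()          # needs a CUDA device
opt = torch.optim.Adam(model.parameters(), lr=1e-3, betas=(0.9, 0.95), eps=1e-6)
scaler = torch.cuda.amp.GradScaler()

x = torch.randn(4, 8, device="cuda")
y = torch.randint(0, 2, (4,), device="cuda")
with torch.cuda.amp.autocast():               # fp16 forward, as with AMP_FP16: True
    loss = F.cross_entropy(model(x), y)
scaler.scale(loss).backward()
scaler.unscale_(opt)                          # so clipping sees true gradients
torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)  # GRAD_CLIP: 0.1, type 'norm'
scaler.step(opt)
scaler.update()
opt.zero_grad()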
configs/BERT_L12_H768_experiments/16tasks_training_basedense_stage1_64gpu.yaml ADDED
@@ -0,0 +1,739 @@
1
+ _BASE_: "base_model_bert_l12_h768.yaml"
2
+
3
+ SHARED_TARGETS:
4
+
5
+ -
6
+ NAME: 'ImageNet22k'
7
+ SHARED_TARGETS_CFG:
8
+ FILE_PATH: 'open_source_dataset/imagenet_22k_class_name_CLIP_with_endoftext.pkl'
9
+ DISTRIBUTED: True
10
+
11
+ -
12
+ NAME: 'Vocab_Word'
13
+ SHARED_TARGETS_CFG:
14
+ FILE_PATH: 'open_source_dataset/vocabulary_CLIP_with_endoftext.pkl'
15
+ DISTRIBUTED: True
16
+
17
+ -
18
+ NAME: 'MomentsInTime'
19
+ SHARED_TARGETS_CFG:
20
+ FILE_PATH: 'open_source_dataset/MiT_class_name_CLIP_with_endoftext.pkl'
21
+ DISTRIBUTED: False
22
+
23
+ -
24
+ NAME: 'Kinetics700'
25
+ SHARED_TARGETS_CFG:
26
+ FILE_PATH: 'open_source_dataset/k700_class_name_CLIP_with_endoftext.pkl'
27
+ DISTRIBUTED: False
28
+
29
+ TASKS:
30
+
31
+ -
32
+ NAME: imagenet22k
33
+ DATASETS:
34
+ TRAIN: 'ImageNet22KDataset'
35
+ TASK_TYPE: 'image_classification'
36
+ DATASET_NAME: 'ImageNet22k'
37
+ TARGET_SET: ['ImageNet22k']
38
+
39
+ DATALOADER:
40
+ TRAIN_BATCH_SIZE: 720
41
+ # TEST_BATCH_SIZE: 2
42
+ NUM_WORKERS: 2
43
+ FEATS_FOLDER: 'open_source_dataset/imagenet22k'
44
+ S3_PATH: 'cluster2:s3://imagenet22k'
45
+ ANNO_FOLDER: 'open_source_dataset/'
46
+ SAMPLING_WEIGHT: 2.486
47
+ MIXUP: 0.8
48
+ CUTMIX: 1.0
49
+ MIXUP_PROB: 1.0
50
+ MIXUP_SWITCH_PROB: 0.5
51
+ MIXUP_MODE: 'batch'
52
+ MIXUP_LABEL_SMOOTHING: 0.1
53
+ MODEL:
54
+ MAX_SEQ_LEN: -1
55
+ LABELS_NUM: 21842
56
+ TEMP_NAME: logit_scale_img_cls
57
+ LOSSES:
58
+ NAMES: ['SoftTargetCrossEntropy', 'Accuracy']
59
+ LOSS_WEIGHT: 1.0
60
+ REDUCTION: 'mean'
61
+
62
+ -
63
+ NAME: K700_retrieve
64
+ DATASETS:
65
+ TRAIN: 'VideoDataSet'
66
+ TASK_TYPE: 'video_classification'
67
+ DATASET_NAME: 'K700'
68
+ TARGET_SET: ['Kinetics700']
69
+ DATALOADER:
70
+ TRAIN_BATCH_SIZE: 64
71
+ TEST_BATCH_SIZE: 24
72
+ NUM_WORKERS: 2
73
+ FEATS_FOLDER: 'open_source_dataset/K700'
74
+ ANNO_FOLDER: 'open_source_dataset/K700'
75
+ S3_PATH: 's3://K700/'
76
+ FRAMES_PER_CLIP: 4
77
+ STRIDE: 32
78
+ FILE_EXTENSION: ''
79
+ ANNO_FILE: 'annotation.json'
80
+ TIMESFORMER_AUG: True
81
+ SAMPLING_WEIGHT: 0.76
82
+
83
+ MODEL:
84
+ MAX_SEQ_LEN: -1
85
+ TEMP_NAME: logit_scale_video_cls
86
+ LOSSES:
87
+ NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
88
+ LABELSMOOTHING: 0.1
89
+ LOSS_WEIGHT: 0.1
90
+ INFERENCE:
91
+ VOCAB: 'CLIP'
92
+ GENERATION_MODE: False
93
+
94
+ -
95
+ NAME: MomentsInTime
96
+ DATASETS:
97
+ TRAIN: 'VideoDataSet'
98
+ TASK_TYPE: 'video_classification'
99
+ DATASET_NAME: 'MiT'
100
+ TARGET_SET: ['MomentsInTime']
101
+ DATALOADER:
102
+ TRAIN_BATCH_SIZE: 112
103
+ TEST_BATCH_SIZE: 8
104
+ NUM_WORKERS: 2
105
+ FEATS_FOLDER: 'open_source_dataset/MomentsInTime'
106
+ ANNO_FOLDER: 'open_source_dataset/MomentsInTime'
107
+ S3_PATH: 's3://MomentsInTime/'
108
+ FRAMES_PER_CLIP: 3
109
+ STRIDE: 32
110
+ FILE_EXTENSION: ''
111
+ ANNO_FILE: 'annotation.json'
112
+ TIMESFORMER_AUG: True
113
+ SAMPLING_WEIGHT: 0.44
114
+
115
+ MODEL:
116
+ MAX_SEQ_LEN: -1
117
+ TEMP_NAME: logit_scale_video_cls
118
+ LOSSES:
119
+ NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
120
+ LABELSMOOTHING: 0.1
121
+ LOSS_WEIGHT: 0.1
122
+ INFERENCE:
123
+ NAME: 'MiTEvaler'
124
+ ID_KEY: 'video_name'
125
+ VALUE: 'label'
126
+ VAL_ANNFILE: 'open_source_dataset/MomentsInTime/annotation.json'
127
+ TEST_ANNFILE: ''
128
+ GENERATION_MODE: False
129
+ NUM_VIEWS: 1
130
+
131
+ -
132
+ NAME: bookswiki_pretrain
133
+ DATASETS:
134
+ TRAIN: 'GeneralCorpusDataset'
135
+ TASK_TYPE: 'text_mlm'
136
+ DATASET_NAME: 'BooksWiki'
137
+ TARGET_SET: ['Vocab_Word']
138
+ VERSION: 'v2'
139
+ DATALOADER:
140
+ TRAIN_BATCH_SIZE: 512
141
+ TEST_BATCH_SIZE: 32
142
+ NUM_WORKERS: 2
143
+ ANNO_FOLDER: 'open_source_dataset/text_corpus' # 'open_source_dataset/bert_pretrain_data/bookswiki'
144
+ # ANNO_FOLDER: 'open_source_dataset/bert_pretrain_data/bookswiki'
145
+ SEQ_PER_SAMPLE: 1 # duplicate key: SEQ_PER_SAMPLE is set again to 128 below, and the later value wins
146
+ SAMPLER: NodeDistributed
147
+ CACHE_MODE: True
148
+ SEQ_PER_SAMPLE: 128
149
+ MIN_SEQ_PER_SAMPLE: 128
150
+ APPEND_EOS: True
151
+ ONE_STREAM: False
152
+ SAMPLING_WEIGHT: 2.75
153
+ RANDOM_MASK: True
154
+ MODEL:
155
+ MAX_SEQ_LEN: 128
156
+ TEMP_NAME: logit_scale_text_mlm
157
+ LOSSES:
158
+ NAMES: ['CrossEntropy', 'Accuracy']
159
+ LOSS_WEIGHT: 0.5
160
+ REDUCTION: 'mean'
161
+ INFERENCE:
162
+ VOCAB: 'CLIP'
163
+ GENERATION_MODE: False
164
+
165
+
166
+ -
167
+ NAME: yfcc_caption
168
+ DATASETS:
169
+ TRAIN: 'ImageTextPairDataset'
170
+ TASK_TYPE: 'image_caption'
171
+ DATASET_NAME: 'YFCC'
172
+ TARGET_SET: ['Vocab_Word']
173
+ DATALOADER:
174
+ TRAIN_BATCH_SIZE: 300
175
+ TEST_BATCH_SIZE: 32
176
+ NUM_WORKERS: 2
177
+ S3_ANNO_FOLDER: 'cluster2:s3://yfcc'
178
+ ANNO_FOLDER: 'open_source_dataset/yfcc'
179
+ ANNO_FILENAME: 'yfcc100m_subset_available_untokenized.json'
180
+ FEATS_FOLDER: 'open_source_dataset/yfcc/'
181
+ S3_PATH: 'cluster2:s3://yfcc/'
182
+ SEQ_PER_SAMPLE: 1
183
+ SAMPLER: NodeDistributed
184
+ CACHE_MODE: True
185
+ CIRCULAR_CACHE_MODE: False
186
+ ZIP_MODE: False
187
+ CACHE_ORIGIN_IMAGE: False
188
+ RANDOM_CAPTION: True
189
+ AS_NUMPY_AS_POSSIBLE: False
190
+ SAMPLING_WEIGHT: 0.5840
191
+ TRANSFORM: 'clip_transforms'
192
+ MODEL:
193
+ MAX_SEQ_LEN: 50
194
+ TEMP_NAME: logit_scale_caption
195
+ LOSSES:
196
+ NAMES: ['CrossEntropy', 'Accuracy']
197
+ LOSS_WEIGHT: 1.0
198
+ REDUCTION: 'mean'
199
+ INFERENCE:
200
+ VOCAB: 'CLIP'
201
+ GENERATION_MODE: False
202
+
203
+ -
204
+ NAME: cc12m_caption
205
+ DATASETS:
206
+ TRAIN: 'ImageTextPairDataset'
207
+ TASK_TYPE: 'image_caption'
208
+ DATASET_NAME: 'CC12M'
209
+ TARGET_SET: ['Vocab_Word']
210
+ DATALOADER:
211
+ TRAIN_BATCH_SIZE: 300
212
+ TEST_BATCH_SIZE: 32
213
+ NUM_WORKERS: 2
214
+ S3_ANNO_FOLDER: 's3://cc12m/'
215
+ ANNO_FOLDER: 'open_source_dataset/c12m/'
216
+ ANNO_FILENAME: 'train_available.json'
217
+ FEATS_FOLDER: 'open_source_dataset/c12m/'
218
+ S3_PATH: 's3://cc12m/'
219
+ SEQ_PER_SAMPLE: 1
220
+ SAMPLER: NodeDistributed
221
+ CACHE_MODE: True
222
+ CIRCULAR_CACHE_MODE: False
223
+ ZIP_MODE: False
224
+ CACHE_ORIGIN_IMAGE: False
225
+ RANDOM_CAPTION: False
226
+ AS_NUMPY_AS_POSSIBLE: False
227
+ SAMPLING_WEIGHT: 0.5057
228
+ TRANSFORM: 'clip_transforms'
229
+ MODEL:
230
+ MAX_SEQ_LEN: 50
231
+ TEMP_NAME: logit_scale_caption
232
+ LOSSES:
233
+ NAMES: ['CrossEntropy', 'Accuracy']
234
+ LOSS_WEIGHT: 1.0
235
+ REDUCTION: 'mean'
236
+ INFERENCE:
237
+ VOCAB: 'CLIP'
238
+ GENERATION_MODE: False
239
+
240
+ -
241
+ NAME: cc3m_caption
242
+ DATASETS:
243
+ TRAIN: 'ImageTextPairDataset'
244
+ TASK_TYPE: 'image_caption'
245
+ DATASET_NAME: 'CC3M'
246
+ TARGET_SET: ['Vocab_Word']
247
+ DATALOADER:
248
+ TRAIN_BATCH_SIZE: 300
249
+ TEST_BATCH_SIZE: 32
250
+ NUM_WORKERS: 2
251
+ S3_ANNO_FOLDER: 's3://cc3m/'
252
+ ANNO_FOLDER: 'open_source_dataset/cc3m/'
253
+ ANNO_FILENAME: 'train_spacy.json'
254
+ FEATS_FOLDER: 'open_source_dataset/cc3m/'
255
+ S3_PATH: 's3://cc3m/'
256
+ SEQ_PER_SAMPLE: 1
257
+ SAMPLER: NodeDistributed
258
+ CACHE_MODE: True
259
+ CIRCULAR_CACHE_MODE: False
260
+ ZIP_MODE: False
261
+ CACHE_ORIGIN_IMAGE: False
262
+ RANDOM_CAPTION: False
263
+ AS_NUMPY_AS_POSSIBLE: False
264
+ SAMPLING_WEIGHT: 0.26295
265
+ TRANSFORM: 'clip_transforms'
266
+ MODEL:
267
+ MAX_SEQ_LEN: 50
268
+ TEMP_NAME: logit_scale_caption
269
+ LOSSES:
270
+ NAMES: ['CrossEntropy', 'Accuracy']
271
+ LOSS_WEIGHT: 1.0
272
+ REDUCTION: 'mean'
273
+ INFERENCE:
274
+ VOCAB: 'CLIP'
275
+ GENERATION_MODE: False
276
+
277
+ -
278
+ NAME: vg_caption
279
+ DATASETS:
280
+ TRAIN: 'ImageTextPairDataset'
281
+ TASK_TYPE: 'image_caption'
282
+ DATASET_NAME: 'VG'
283
+ TARGET_SET: ['Vocab_Word']
284
+ DATALOADER:
285
+ TRAIN_BATCH_SIZE: 300
286
+ TEST_BATCH_SIZE: 32
287
+ NUM_WORKERS: 2
288
+ FEATS_FOLDER: 'open_source_dataset/visual_genome/images'
289
+ ANNO_FOLDER: 'open_source_dataset/visual_genome/annotations'
290
+ S3_PATH: 's3://visual_genome/images'
291
+ ANNO_FILENAME: 'vg_captions_128filter.json'
292
+ SEQ_PER_SAMPLE: 1
293
+ CACHE_MODE: True
294
+ CIRCULAR_CACHE_MODE: False
295
+ ZIP_MODE: False
296
+ CACHE_ORIGIN_IMAGE: False
297
+ RANDOM_CAPTION: False
298
+ AS_NUMPY_AS_POSSIBLE: False
299
+ SAMPLING_WEIGHT: 0.1766
300
+ TRANSFORM: 'clip_transforms'
301
+ MODEL:
302
+ MAX_SEQ_LEN: 30
303
+ TEMP_NAME: logit_scale_caption
304
+ LOSSES:
305
+ NAMES: ['CrossEntropy', 'Accuracy']
306
+ LOSS_WEIGHT: 1.0
307
+ REDUCTION: 'mean'
308
+ INFERENCE:
309
+ VOCAB: 'CLIP'
310
+ GENERATION_MODE: True
311
+
312
+
313
+ -
314
+ NAME: mscoco_caption
315
+ DATASETS:
316
+ TRAIN: 'ImageTextPairDataset'
317
+ # VAL: 'ImageTextPairDataset'
318
+ # TEST: 'ImageTextPairDataset'
319
+ TASK_TYPE: 'image_caption'
320
+ DATASET_NAME: 'MSCOCO'
321
+ TARGET_SET: ['Vocab_Word']
322
+ DATALOADER:
323
+ TRAIN_BATCH_SIZE: 300
324
+ TEST_BATCH_SIZE: 32
325
+ NUM_WORKERS: 1
326
+ FEATS_FOLDER: 'open_source_dataset/mscoco_dataset/coco_origin'
327
+ ANNO_FOLDER: 'open_source_dataset/mscoco_dataset/new_annotations'
328
+ S3_PATH: 's3://coco/'
329
+ SEQ_PER_SAMPLE: 1
330
+ CACHE_MODE: True
331
+ CIRCULAR_CACHE_MODE: False
332
+ ZIP_MODE: False
333
+ CACHE_ORIGIN_IMAGE: False
334
+ RANDOM_CAPTION: False
335
+ AS_NUMPY_AS_POSSIBLE: False
336
+ SAMPLING_WEIGHT: 0.1144
337
+ TRANSFORM: 'clip_transforms'
338
+ RANDOM_MASK: True
339
+ MODEL:
340
+ MAX_SEQ_LEN: 50
341
+ EVAL_MAX_SEQ_LEN: 21
342
+ TEMP_NAME: logit_scale_caption
343
+ LOSSES:
344
+ NAMES: ['CrossEntropy', 'Accuracy']
345
+ LOSS_WEIGHT: 1.0
346
+ REDUCTION: 'mean'
347
+ DECODE_STRATEGY:
348
+ NAME: 'CaptionBeamSearcherV3'
349
+ BEAM_SIZE: 2
350
+ # LEN_PENALTY: 1.0
351
+ INFERENCE:
352
+ NAME: 'COCOEvaler'
353
+ VOCAB: 'CLIP'
354
+ ID_KEY: 'image_id'
355
+ VALUE: 'caption'
356
+ VAL_ANNFILE: 'open_source_dataset/mscoco_dataset/new_annotations/captions_val5k.json'
357
+ TEST_ANNFILE: 'open_source_dataset/mscoco_dataset/new_annotations/captions_test5k.json'
358
+ GENERATION_MODE: True
359
+
360
+ -
361
+ NAME: sbu_caption
362
+ DATASETS:
363
+ TRAIN: 'ImageTextPairDataset'
364
+ TASK_TYPE: 'image_caption'
365
+ DATASET_NAME: 'SBU'
366
+ TARGET_SET: ['Vocab_Word']
367
+ DATALOADER:
368
+ TRAIN_BATCH_SIZE: 300
369
+ TEST_BATCH_SIZE: 32
370
+ NUM_WORKERS: 1
371
+ S3_ANNO_FOLDER: 's3://SBU/annotations'
372
+ ANNO_FOLDER: 'open_source_dataset/sbucaption/annotations'
373
+ ANNO_FILENAME: 'subcaption.json'
374
+ FEATS_FOLDER: 'open_source_dataset/sbucaption/'
375
+ S3_PATH: 's3://SBU/images'
376
+ SEQ_PER_SAMPLE: 1
377
+ SAMPLER: NodeDistributed
378
+ CACHE_MODE: True
379
+ CIRCULAR_CACHE_MODE: False
380
+ ZIP_MODE: False
381
+ CACHE_ORIGIN_IMAGE: False
382
+ RANDOM_CAPTION: False
383
+ AS_NUMPY_AS_POSSIBLE: False
384
+ SAMPLING_WEIGHT: 0.1383
385
+ TRANSFORM: 'clip_transforms'
386
+ MODEL:
387
+ MAX_SEQ_LEN: 50
388
+ TEMP_NAME: logit_scale_caption
389
+ LOSSES:
390
+ NAMES: ['CrossEntropy', 'Accuracy']
391
+ LOSS_WEIGHT: 1.0
392
+ REDUCTION: 'mean'
393
+ INFERENCE:
394
+ VOCAB: 'CLIP'
395
+ GENERATION_MODE: False
396
+
397
+ -
398
+ NAME: yfcc_retrieve
399
+ DATASETS:
400
+ TRAIN: 'ImageTextPairDataset'
401
+ TASK_TYPE: 'image_retrieval'
402
+ DATASET_NAME: 'YFCC'
403
+ DATALOADER:
404
+ TRAIN_BATCH_SIZE: 512
405
+ TEST_BATCH_SIZE: 32
406
+ NUM_WORKERS: 2
407
+ S3_ANNO_FOLDER: 'cluster2:s3://yfcc'
408
+ ANNO_FOLDER: 'open_source_dataset/yfcc'
409
+ ANNO_FILENAME: 'yfcc100m_subset_available_untokenized.json'
410
+ FEATS_FOLDER: 'open_source_dataset/yfcc/'
411
+ S3_PATH: 'cluster2:s3://yfcc/'
412
+ SAMPLER: NodeDistributed
413
+ CACHE_MODE: True
414
+ CIRCULAR_CACHE_MODE: False
415
+ ZIP_MODE: False
416
+ CACHE_ORIGIN_IMAGE: False
417
+ RANDOM_CAPTION: True
418
+ AS_NUMPY_AS_POSSIBLE: False
419
+ SAMPLING_WEIGHT: 0.5840
420
+ TRANSFORM: 'clip_transforms'
421
+ MODEL:
422
+ MAX_SEQ_LEN: 50
423
+ TEMP_NAME: logit_scale_retrieve
424
+ LOSSES:
425
+ NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
426
+ LABELSMOOTHING: 0.1
427
+ LOSS_WEIGHT: 0.5
428
+ REDUCTION: 'mean'
429
+ INFERENCE:
430
+ VOCAB: 'CLIP'
431
+ GENERATION_MODE: False
432
+
433
+ -
434
+ NAME: cc12m_retrieve
435
+ DATASETS:
436
+ TRAIN: 'ImageTextPairDataset'
437
+ TASK_TYPE: 'image_retrieval'
438
+ DATASET_NAME: 'CC12M'
439
+ DATALOADER:
440
+ TRAIN_BATCH_SIZE: 512
441
+ TEST_BATCH_SIZE: 32
442
+ NUM_WORKERS: 2
443
+ S3_ANNO_FOLDER: 's3://cc12m/'
444
+ ANNO_FOLDER: 'open_source_dataset/c12m/'
445
+ ANNO_FILENAME: 'train_available.json'
446
+ FEATS_FOLDER: 'open_source_dataset/c12m/'
447
+ S3_PATH: 's3://cc12m/'
448
+ SAMPLER: NodeDistributed
449
+ CACHE_MODE: True
450
+ CIRCULAR_CACHE_MODE: False
451
+ ZIP_MODE: False
452
+ CACHE_ORIGIN_IMAGE: False
453
+ RANDOM_CAPTION: False
454
+ AS_NUMPY_AS_POSSIBLE: False
455
+ SAMPLING_WEIGHT: 0.5057
456
+ TRANSFORM: 'clip_transforms'
457
+ MODEL:
458
+ MAX_SEQ_LEN: 50
459
+ TEMP_NAME: logit_scale_retrieve
460
+ LOSSES:
461
+ NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
462
+ LABELSMOOTHING: 0.1
463
+ LOSS_WEIGHT: 0.5
464
+ REDUCTION: 'mean'
465
+ INFERENCE:
466
+ VOCAB: 'CLIP'
467
+ GENERATION_MODE: False
468
+
469
+ -
470
+ NAME: cc3m_retrieve
471
+ DATASETS:
472
+ TRAIN: 'ImageTextPairDataset'
473
+ TASK_TYPE: 'image_retrieval'
474
+ DATASET_NAME: 'CC3M'
475
+ DATALOADER:
476
+ TRAIN_BATCH_SIZE: 512
477
+ TEST_BATCH_SIZE: 32
478
+ NUM_WORKERS: 2
479
+ S3_ANNO_FOLDER: 's3://cc3m/'
480
+ ANNO_FOLDER: 'open_source_dataset/cc3m/'
481
+ ANNO_FILENAME: 'train_spacy.json'
482
+ FEATS_FOLDER: 'open_source_dataset/cc3m/'
483
+ S3_PATH: 's3://cc3m/'
484
+ SAMPLER: NodeDistributed
485
+ CACHE_MODE: True
486
+ CIRCULAR_CACHE_MODE: False
487
+ ZIP_MODE: False
488
+ CACHE_ORIGIN_IMAGE: False
489
+ RANDOM_CAPTION: False
490
+ AS_NUMPY_AS_POSSIBLE: False
491
+ SAMPLING_WEIGHT: 0.26295
492
+ TRANSFORM: 'clip_transforms'
493
+ MODEL:
494
+ MAX_SEQ_LEN: 50
495
+ TEMP_NAME: logit_scale_retrieve
496
+ LOSSES:
497
+ NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
498
+ LABELSMOOTHING: 0.1
499
+ LOSS_WEIGHT: 0.5
500
+ REDUCTION: 'mean'
501
+ INFERENCE:
502
+ VOCAB: 'CLIP'
503
+ GENERATION_MODE: False
504
+
505
+ -
506
+ NAME: vg_retrieve
507
+ DATASETS:
508
+ TRAIN: 'ImageTextPairDataset'
509
+ TASK_TYPE: 'image_retrieval'
510
+ DATASET_NAME: 'VG'
511
+ DATALOADER:
512
+ TRAIN_BATCH_SIZE: 512
513
+ TEST_BATCH_SIZE: 32
514
+ NUM_WORKERS: 2
515
+ FEATS_FOLDER: 'open_source_dataset/visual_genome/images'
516
+ ANNO_FOLDER: 'open_source_dataset/visual_genome/annotations'
517
+ S3_PATH: 's3://visual_genome/images'
518
+ ANNO_FILENAME: 'vg_captions_128filter.json'
519
+ SEQ_PER_SAMPLE: 1
520
+ CACHE_MODE: True
521
+ CIRCULAR_CACHE_MODE: False
522
+ ZIP_MODE: False
523
+ CACHE_ORIGIN_IMAGE: False
524
+ RANDOM_CAPTION: False
525
+ AS_NUMPY_AS_POSSIBLE: False
526
+ SAMPLING_WEIGHT: 0.1766
527
+ TRANSFORM: 'clip_transforms'
528
+ MODEL:
529
+ MAX_SEQ_LEN: 30
530
+ TEMP_NAME: logit_scale_retrieve
531
+ LOSSES:
532
+ NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
533
+ LABELSMOOTHING: 0.1
534
+ LOSS_WEIGHT: 0.5
535
+ REDUCTION: 'mean'
536
+ INFERENCE:
537
+ VOCAB: 'CLIP'
538
+ GENERATION_MODE: False
539
+
540
+ -
541
+ NAME: mscoco_retrieve
542
+ DATASETS:
543
+ TRAIN: 'ImageTextPairDataset'
544
+ # TEST: 'ImageTextPairDataset'
545
+ TASK_TYPE: 'image_retrieval'
546
+ DATASET_NAME: 'MSCOCO'
547
+ DATALOADER:
548
+ TRAIN_BATCH_SIZE: 512
549
+ TEST_BATCH_SIZE: 32
550
+ NUM_WORKERS: 1
551
+ FEATS_FOLDER: 'open_source_dataset/mscoco_dataset/coco_origin'
552
+ ANNO_FOLDER: 'open_source_dataset/mscoco_dataset/new_annotations'
553
+ S3_PATH: 's3://coco/'
554
+ SEQ_PER_SAMPLE: 1
555
+ CACHE_MODE: True
556
+ CIRCULAR_CACHE_MODE: False
557
+ ZIP_MODE: False
558
+ CACHE_ORIGIN_IMAGE: False
559
+ RANDOM_CAPTION: False
560
+ AS_NUMPY_AS_POSSIBLE: False
561
+ SAMPLING_WEIGHT: 0.1144
562
+ TRANSFORM: 'clip_transforms'
563
+ MODEL:
564
+ MAX_SEQ_LEN: 50
565
+ TEMP_NAME: logit_scale_retrieve
566
+ LOSSES:
567
+ NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
568
+ LABELSMOOTHING: 0.1
569
+ LOSS_WEIGHT: 0.5
570
+ REDUCTION: 'mean'
571
+ INFERENCE:
572
+ VOCAB: 'CLIP'
573
+ ID_KEY: 'image_id'
574
+ VALUE: 'caption'
575
+ NAME: 'RetrievalEvaler'
576
+ VAL_ANNFILE: 'open_source_dataset/flickr30k/all_data_final_val_set0_2014.jsonline'
577
+ TEST_ANNFILE: 'open_source_dataset/flickr30k/all_data_final_test_set0_2014.jsonline'
578
+ GENERATION_MODE: False
579
+
580
+ -
581
+ NAME: sbu_retrieve
582
+ DATASETS:
583
+ TRAIN: 'ImageTextPairDataset'
584
+ TASK_TYPE: 'image_retrieval'
585
+ DATASET_NAME: 'SBU'
586
+ DATALOADER:
587
+ TRAIN_BATCH_SIZE: 512
588
+ TEST_BATCH_SIZE: 32
589
+ NUM_WORKERS: 1
590
+ S3_ANNO_FOLDER: 's3://SBU/annotations'
591
+ ANNO_FOLDER: 'open_source_dataset/sbucaption/annotations'
592
+ ANNO_FILENAME: 'subcaption.json'
593
+ FEATS_FOLDER: 'open_source_dataset/sbucaption/'
594
+ S3_PATH: 's3://SBU/images'
595
+ SAMPLER: NodeDistributed
596
+ CACHE_MODE: True
597
+ CIRCULAR_CACHE_MODE: False
598
+ ZIP_MODE: False
599
+ CACHE_ORIGIN_IMAGE: False
600
+ RANDOM_CAPTION: False
601
+ AS_NUMPY_AS_POSSIBLE: False
602
+ SAMPLING_WEIGHT: 0.1383
603
+ TRANSFORM: 'clip_transforms'
604
+ MODEL:
605
+ MAX_SEQ_LEN: 50
606
+ TEMP_NAME: logit_scale_retrieve
607
+ LOSSES:
608
+ NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
609
+ LABELSMOOTHING: 0.1
610
+ LOSS_WEIGHT: 0.5
611
+ REDUCTION: 'mean'
612
+ INFERENCE:
613
+ VOCAB: 'CLIP'
614
+ GENERATION_MODE: False
615
+
616
+
617
+ ENGINE:
618
+ NAME: 'UnifiedTrainer'
619
+
620
+ MODEL:
621
+ META_ARCHITECTURE: 'MultiTaskTransformerEncoder'
622
+ ENCODER: 'UnifiedBertEncoder'
623
+
624
+
625
+ SHARE_LAYERNORM: True
626
+ BERT:
627
+ NORMALIZE_DECISION: "BERTPre"
628
+ DROP_PATH_PROB: 0.1
629
+ DROP_PATH_PROB_FIXED: True
630
+
631
+ UNIFY_QKV: True
632
+
633
+ MODEL_EMA: False
634
+ MODEL_EMA_DECAY: 0.9999
635
+
636
+ MAEParamsInit: True
637
+ POSEMBEDFIX: True
638
+
639
+
640
+ IMG_INPUT_SIZE: 160
641
+ PATCH_SIZE: 16
642
+
643
+ LAYER_SCALE: True
644
+ LAYER_SCALE_INIT: 1e-3
645
+ OLD_CHECKPONT: True
646
+
647
+
648
+ DATALOADER:
649
+ USE_WEIGHTED_SAMPLER: True
650
+ UNIFIED_DATASET: True
651
+ NUM_WORKERS: 32
652
+
653
+ PADDING_TO_MAX: False # set True only for debugging, or for token MoE with distributed MoE
654
+
655
+
656
+
657
+ ####################################### Optimizer #######################################
658
+ SOLVER:
659
+ NAME: 'Adam'
660
+ TORCH_OPTIMIZER: True
661
+ PARAMS_SEPERATE: True
662
+ # PARAMS_GROUP: True
663
+ # EPOCH: 1
664
+ MAX_ITER: 200000
665
+ CHECKPOINT_PERIOD: 10000
666
+ EVAL_PERIOD: 10000000 # effectively never: periodic evaluation during training is disabled
667
+ BASE_LR: 0.001
668
+ BIAS_LR_FACTOR: 1.0
669
+ WEIGHT_DECAY: 0.2
670
+ WEIGHT_DECAY_NORM: 0.0
671
+ WEIGHT_DECAY_BIAS: 0.0
672
+ WEIGHT_DECAY_EMBEDDING: 0.0
673
+ MOMENTUM: 0.9
674
+ DAMPENING: 0.0
675
+ NESTEROV: 0.0
676
+ BETAS: [0.9, 0.95]
677
+ EPS: 1e-6
678
+ GRAD_CLIP: 0.1
679
+ GRAD_CLIP_TYPE: 'norm'
680
+ ACCUM_ITER: 0
681
+ AMP_FP16: True
682
+ APEX_FP16: False # dangerous: apex O2 can be unstable, so native AMP is used instead
683
+
684
+ WRITE_PERIOD: 50
685
+ MIN_LOSS_SCLE: 2048.0
686
+ # BF16: False # True
687
+ # ZEROSTAGE: 2
688
+
689
+ LOSS_SCALE_WINDOW: 200
690
+
691
+
692
+ ####################################### lr scheduler #######################################
693
+ LR_SCHEDULER:
694
+ NAME: 'WarmupCosine'
695
+ WARMUP: 10000
696
+ MIN_LR: 0.000001
697
+
698
+ ####################################### evaluation #######################################
699
+ INFERENCE:
700
+
701
+ VOCAB: 'CLIP'
702
+ ITER_BASED: True
703
+
704
+
705
+ find_unused_parameters: true
706
+
707
+ # ENCODERS:
708
+ # -
709
+ # NAME: VisualEncoder
710
+ # TYPE: VisualEncoder
711
+ # DROP_PATH_PROB: 0.0
712
+ # HIDDEN_SIZE: 192
713
+ # HIDDEN_DROPOUT_PROB: 0.
714
+ # HIDDEN_ACT: "gelu"
715
+ # NUM_ATTENTION_HEADS: 3
716
+ # INTERMEDIATE_SIZE: 768
717
+ # INTERMEDIATE_DROP: 0.
718
+ # FFN_DROPOUT_PROB: 0.
719
+ # ATTENTION_PROBS_DROPOUT_PROB: 0.
720
+ # NUM_HIDDEN_LAYERS: 6
721
+ # NUM_GENERATION_LAYERS: 0
722
+ # DROP_PATH_PROB_FIXED: True
723
+
724
+ # -
725
+ # NAME: TextEncoder
726
+ # TYPE: TextEncoder
727
+ # DROP_PATH_PROB: 0.0
728
+ # HIDDEN_SIZE: 192
729
+ # HIDDEN_DROPOUT_PROB: 0.
730
+ # HIDDEN_ACT: "gelu"
731
+ # NUM_ATTENTION_HEADS: 3
732
+ # INTERMEDIATE_SIZE: 768
733
+ # INTERMEDIATE_DROP: 0.
734
+ # FFN_DROPOUT_PROB: 0.
735
+ # ATTENTION_PROBS_DROPOUT_PROB: 0.
736
+ # NUM_HIDDEN_LAYERS: 6
737
+ # NUM_GENERATION_LAYERS: 0
738
+ # DROP_PATH_PROB_FIXED: True
739
+
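Note: the stage-1 dense run uses the WarmupCosine schedule with WARMUP: 10000, BASE_LR: 0.001, MIN_LR: 1e-6 and MAX_ITER: 200000. A plausible reading of that schedule, sketched below (the repo's scheduler may differ in details):

import math

def warmup_cosine_lr(step, base_lr=0.001, min_lr=1e-6, warmup=10000, max_iter=200000):
    # Linear warmup to BASE_LR, then cosine decay to MIN_LR.
    if step < warmup:
        return base_lr * step / warmup
    t = (step - warmup) / max(1, max_iter - warmup)
    return min_lr + 0.5 * (base_lr - min_lr) * (1.0 + math.cos(math.pi * t))

# warmup_cosine_lr(10000) == 0.001; warmup_cosine_lr(200000) == 1e-6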
configs/BERT_L12_H768_experiments/16tasks_training_basedense_stage2_64gpu.yaml ADDED
@@ -0,0 +1,750 @@
1
+ _BASE_: "base_model_bert_l12_h768.yaml"
2
+
3
+ SHARED_TARGETS:
4
+
5
+ -
6
+ NAME: 'ImageNet22k'
7
+ SHARED_TARGETS_CFG:
8
+ FILE_PATH: 'open_source_dataset/imagenet_22k_class_name_CLIP_with_endoftext.pkl'
9
+ DISTRIBUTED: True
10
+
11
+ -
12
+ NAME: 'Vocab_Word'
13
+ SHARED_TARGETS_CFG:
14
+ FILE_PATH: 'open_source_dataset/vocabulary_CLIP_with_endoftext.pkl'
15
+ DISTRIBUTED: True
16
+
17
+ -
18
+ NAME: 'MomentsInTime'
19
+ SHARED_TARGETS_CFG:
20
+ FILE_PATH: 'open_source_dataset/MiT_class_name_CLIP_with_endoftext.pkl'
21
+ DISTRIBUTED: False
22
+
23
+ -
24
+ NAME: 'Kinetics700'
25
+ SHARED_TARGETS_CFG:
26
+ FILE_PATH: 'open_source_dataset/k700_class_name_CLIP_with_endoftext.pkl'
27
+ DISTRIBUTED: False
28
+
29
+ TASKS:
30
+
31
+ -
32
+ NAME: imagenet22k
33
+ DATASETS:
34
+ TRAIN: 'ImageNet22KDataset'
35
+ TASK_TYPE: 'image_classification'
36
+ DATASET_NAME: 'ImageNet22k'
37
+ TARGET_SET: ['ImageNet22k']
38
+
39
+ DATALOADER:
40
+ TRAIN_BATCH_SIZE: 440
41
+ # TEST_BATCH_SIZE: 2
42
+ NUM_WORKERS: 2
43
+ FEATS_FOLDER: 'open_source_dataset/imagenet22k'
44
+ S3_PATH: 'cluster2:s3://imagenet22k'
45
+ ANNO_FOLDER: 'open_source_dataset/'
46
+ SAMPLING_WEIGHT: 2.486
47
+ MIXUP: 0.0
48
+ CUTMIX: 0.0
49
+ MIXUP_PROB: 1.0
50
+ MIXUP_SWITCH_PROB: 0.5
51
+ MIXUP_MODE: 'batch'
52
+ MIXUP_LABEL_SMOOTHING: 0.1
53
+ MODEL:
54
+ MAX_SEQ_LEN: -1
55
+ LABELS_NUM: 21842
56
+ TEMP_NAME: logit_scale_img_cls
57
+ LOSSES:
58
+ NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
59
+ LOSS_WEIGHT: 1.0
60
+ REDUCTION: 'mean'
61
+ LABELSMOOTHING: 0.1
62
+ INFERENCE:
63
+ NAME: 'ImageNetEvaler'
64
+ ID_KEY: 'image_id'
65
+ VALUE: 'cls_logits'
66
+ VAL_ANNFILE: 'open_source_dataset/imagenet/meta/val.txt'
67
+ # VAL_ANNFILE: '/mnt/lustrenew/lihao2/projects/xmodaler_2/val_debug.txt'
68
+ TEST_ANNFILE: ''
69
+ GENERATION_MODE: False
70
+
71
+ -
72
+ NAME: K700_retrieve
73
+ DATASETS:
74
+ TRAIN: 'VideoDataSet'
75
+ TASK_TYPE: 'video_classification'
76
+ DATASET_NAME: 'K700'
77
+ TARGET_SET: ['Kinetics700']
78
+ DATALOADER:
79
+ TRAIN_BATCH_SIZE: 12
80
+ TEST_BATCH_SIZE: 24
81
+ NUM_WORKERS: 2
82
+ FEATS_FOLDER: 'open_source_dataset/K700'
83
+ ANNO_FOLDER: 'open_source_dataset/K700'
84
+ S3_PATH: 's3://K700/'
85
+ FRAMES_PER_CLIP: 8
86
+ STRIDE: 32
87
+ FILE_EXTENSION: ''
88
+ ANNO_FILE: 'annotation.json'
89
+ TIMESFORMER_AUG: True
90
+ SAMPLING_WEIGHT: 1.0
91
+
92
+ MODEL:
93
+ MAX_SEQ_LEN: -1
94
+ TEMP_NAME: logit_scale_video_cls
95
+ LOSSES:
96
+ NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
97
+ LABELSMOOTHING: 0.1
98
+ LOSS_WEIGHT: 0.05
99
+ INFERENCE:
100
+ VOCAB: 'CLIP'
101
+ GENERATION_MODE: False
102
+
103
+ -
104
+ NAME: MomentsInTime
105
+ DATASETS:
106
+ TRAIN: 'VideoDataSet'
107
+ TASK_TYPE: 'video_classification'
108
+ DATASET_NAME: 'MiT'
109
+ TARGET_SET: ['MomentsInTime']
110
+ DATALOADER:
111
+ TRAIN_BATCH_SIZE: 68
112
+ TEST_BATCH_SIZE: 8
113
+ NUM_WORKERS: 2
114
+ FEATS_FOLDER: 'open_source_dataset/MomentsInTime'
115
+ ANNO_FOLDER: 'open_source_dataset/MomentsInTime'
116
+ S3_PATH: 's3://MomentsInTime/'
117
+ FRAMES_PER_CLIP: 3
118
+ STRIDE: 32
119
+ FILE_EXTENSION: ''
120
+ ANNO_FILE: 'annotation.json'
121
+ TIMESFORMER_AUG: True
122
+ SAMPLING_WEIGHT: 0.2
123
+
124
+ MODEL:
125
+ MAX_SEQ_LEN: -1
126
+ TEMP_NAME: logit_scale_video_cls
127
+ LOSSES:
128
+ NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
129
+ LABELSMOOTHING: 0.1
130
+ LOSS_WEIGHT: 0.05
131
+ INFERENCE:
132
+ NAME: 'MiTEvaler'
133
+ ID_KEY: 'video_name'
134
+ VALUE: 'label'
135
+ VAL_ANNFILE: 'open_source_dataset/MomentsInTime/annotation.json'
136
+ TEST_ANNFILE: ''
137
+ GENERATION_MODE: False
138
+ NUM_VIEWS: 1
139
+
140
+ -
141
+ NAME: bookswiki_pretrain
142
+ DATASETS:
143
+ TRAIN: 'GeneralCorpusDataset'
144
+ TASK_TYPE: 'text_mlm'
145
+ DATASET_NAME: 'BooksWiki'
146
+ TARGET_SET: ['Vocab_Word']
147
+ VERSION: 'v2'
148
+ DATALOADER:
149
+ TRAIN_BATCH_SIZE: 512
150
+ TEST_BATCH_SIZE: 32
151
+ NUM_WORKERS: 2
152
+ ANNO_FOLDER: 'open_source_dataset/text_corpus' # 'open_source_dataset/bert_pretrain_data/bookswiki'
153
+ # ANNO_FOLDER: 'open_source_dataset/bert_pretrain_data/bookswiki'
154
+ SEQ_PER_SAMPLE: 1 # duplicate key: SEQ_PER_SAMPLE is set again to 128 below, and the later value wins
155
+ SAMPLER: NodeDistributed
156
+ CACHE_MODE: True
157
+ SEQ_PER_SAMPLE: 128
158
+ MIN_SEQ_PER_SAMPLE: 128
159
+ APPEND_EOS: True
160
+ ONE_STREAM: False
161
+ SAMPLING_WEIGHT: 2.75
162
+ RANDOM_MASK: True
163
+ MODEL:
164
+ MAX_SEQ_LEN: 128
165
+ TEMP_NAME: logit_scale_text_mlm
166
+ LOSSES:
167
+ NAMES: ['CrossEntropy', 'Accuracy']
168
+ LOSS_WEIGHT: 0.25
169
+ REDUCTION: 'mean'
170
+ INFERENCE:
171
+ VOCAB: 'CLIP'
172
+ GENERATION_MODE: False
173
+
174
+
175
+ -
176
+ NAME: yfcc_caption
177
+ DATASETS:
178
+ TRAIN: 'ImageTextPairDataset'
179
+ TASK_TYPE: 'image_caption'
180
+ DATASET_NAME: 'YFCC'
181
+ TARGET_SET: ['Vocab_Word']
182
+ DATALOADER:
183
+ TRAIN_BATCH_SIZE: 200
184
+ TEST_BATCH_SIZE: 32
185
+ NUM_WORKERS: 2
186
+ S3_ANNO_FOLDER: 'cluster2:s3://yfcc'
187
+ ANNO_FOLDER: 'open_source_dataset/yfcc'
188
+ ANNO_FILENAME: 'yfcc100m_subset_available_untokenized.json'
189
+ FEATS_FOLDER: 'open_source_dataset/yfcc/'
190
+ S3_PATH: 'cluster2:s3://yfcc/'
191
+ SEQ_PER_SAMPLE: 1
192
+ SAMPLER: NodeDistributed
193
+ CACHE_MODE: True
194
+ CIRCULAR_CACHE_MODE: False
195
+ ZIP_MODE: False
196
+ CACHE_ORIGIN_IMAGE: False
197
+ RANDOM_CAPTION: False
198
+ AS_NUMPY_AS_POSSIBLE: False
199
+ SAMPLING_WEIGHT: 0.5840
200
+ TRANSFORM: 'clip_transforms'
201
+ MODEL:
202
+ MAX_SEQ_LEN: 50
203
+ TEMP_NAME: logit_scale_caption
204
+ LOSSES:
205
+ NAMES: ['CrossEntropy', 'Accuracy']
206
+ LOSS_WEIGHT: 0.5
207
+ REDUCTION: 'mean'
208
+ INFERENCE:
209
+ VOCAB: 'CLIP'
210
+ GENERATION_MODE: False
211
+
212
+ -
213
+ NAME: cc12m_caption
214
+ DATASETS:
215
+ TRAIN: 'ImageTextPairDataset'
216
+ TASK_TYPE: 'image_caption'
217
+ DATASET_NAME: 'CC12M'
218
+ TARGET_SET: ['Vocab_Word']
219
+ DATALOADER:
220
+ TRAIN_BATCH_SIZE: 200
221
+ TEST_BATCH_SIZE: 32
222
+ NUM_WORKERS: 2
223
+ S3_ANNO_FOLDER: 's3://cc12m/'
224
+ ANNO_FOLDER: 'open_source_dataset/c12m/'
225
+ ANNO_FILENAME: 'train_available.json'
226
+ FEATS_FOLDER: 'open_source_dataset/c12m/'
227
+ S3_PATH: 's3://cc12m/'
228
+ SEQ_PER_SAMPLE: 1
229
+ SAMPLER: NodeDistributed
230
+ CACHE_MODE: True
231
+ CIRCULAR_CACHE_MODE: False
232
+ ZIP_MODE: False
233
+ CACHE_ORIGIN_IMAGE: False
234
+ RANDOM_CAPTION: False
235
+ AS_NUMPY_AS_POSSIBLE: False
236
+ SAMPLING_WEIGHT: 0.5057
237
+ TRANSFORM: 'clip_transforms'
238
+ MODEL:
239
+ MAX_SEQ_LEN: 50
240
+ TEMP_NAME: logit_scale_caption
241
+ LOSSES:
242
+ NAMES: ['CrossEntropy', 'Accuracy']
243
+ LOSS_WEIGHT: 0.5
244
+ REDUCTION: 'mean'
245
+ INFERENCE:
246
+ VOCAB: 'CLIP'
247
+ GENERATION_MODE: False
248
+
249
+ -
250
+ NAME: cc3m_caption
251
+ DATASETS:
252
+ TRAIN: 'ImageTextPairDataset'
253
+ TASK_TYPE: 'image_caption'
254
+ DATASET_NAME: 'CC3M'
255
+ TARGET_SET: ['Vocab_Word']
256
+ DATALOADER:
257
+ TRAIN_BATCH_SIZE: 200
258
+ TEST_BATCH_SIZE: 32
259
+ NUM_WORKERS: 2
260
+ S3_ANNO_FOLDER: 's3://cc3m/'
261
+ ANNO_FOLDER: 'open_source_dataset/cc3m/'
262
+ ANNO_FILENAME: 'train_spacy.json'
263
+ FEATS_FOLDER: 'open_source_dataset/cc3m/'
264
+ S3_PATH: 's3://cc3m/'
265
+ SEQ_PER_SAMPLE: 1
266
+ SAMPLER: NodeDistributed
267
+ CACHE_MODE: True
268
+ CIRCULAR_CACHE_MODE: False
269
+ ZIP_MODE: False
270
+ CACHE_ORIGIN_IMAGE: False
271
+ RANDOM_CAPTION: False
272
+ AS_NUMPY_AS_POSSIBLE: False
273
+ SAMPLING_WEIGHT: 0.26295
274
+ TRANSFORM: 'clip_transforms'
275
+ MODEL:
276
+ MAX_SEQ_LEN: 50
277
+ TEMP_NAME: logit_scale_caption
278
+ LOSSES:
279
+ NAMES: ['CrossEntropy', 'Accuracy']
280
+ LOSS_WEIGHT: 0.5
281
+ REDUCTION: 'mean'
282
+ INFERENCE:
283
+ VOCAB: 'CLIP'
284
+ GENERATION_MODE: False
285
+
286
+ -
287
+ NAME: vg_caption
288
+ DATASETS:
289
+ TRAIN: 'ImageTextPairDataset'
290
+ TASK_TYPE: 'image_caption'
291
+ DATASET_NAME: 'VG'
292
+ TARGET_SET: ['Vocab_Word']
293
+ DATALOADER:
294
+ TRAIN_BATCH_SIZE: 200
295
+ TEST_BATCH_SIZE: 32
296
+ NUM_WORKERS: 2
297
+ FEATS_FOLDER: 'open_source_dataset/visual_genome/images'
298
+ ANNO_FOLDER: 'open_source_dataset/visual_genome/annotations'
299
+ S3_PATH: 's3://visual_genome/images'
300
+ ANNO_FILENAME: 'vg_captions_128filter.json'
301
+ SEQ_PER_SAMPLE: 1
302
+ CACHE_MODE: True
303
+ CIRCULAR_CACHE_MODE: False
304
+ ZIP_MODE: False
305
+ CACHE_ORIGIN_IMAGE: False
306
+ RANDOM_CAPTION: False
307
+ AS_NUMPY_AS_POSSIBLE: False
308
+ SAMPLING_WEIGHT: 0.1766
309
+ TRANSFORM: 'clip_transforms'
310
+ MODEL:
311
+ MAX_SEQ_LEN: 30
312
+ TEMP_NAME: logit_scale_caption
313
+ LOSSES:
314
+ NAMES: ['CrossEntropy', 'Accuracy']
315
+ LOSS_WEIGHT: 0.5
316
+ REDUCTION: 'mean'
317
+ INFERENCE:
318
+ VOCAB: 'CLIP'
319
+ GENERATION_MODE: True
320
+
321
+
322
+ -
323
+ NAME: mscoco_caption
324
+ DATASETS:
325
+ TRAIN: 'ImageTextPairDataset'
326
+ # VAL: 'ImageTextPairDataset'
327
+ # TEST: 'ImageTextPairDataset'
328
+ TASK_TYPE: 'image_caption'
329
+ DATASET_NAME: 'MSCOCO'
330
+ TARGET_SET: ['Vocab_Word']
331
+ DATALOADER:
332
+ TRAIN_BATCH_SIZE: 200
333
+ TEST_BATCH_SIZE: 32
334
+ NUM_WORKERS: 1
335
+ FEATS_FOLDER: 'open_source_dataset/mscoco_dataset/coco_origin'
336
+ ANNO_FOLDER: 'open_source_dataset/mscoco_dataset/new_annotations'
337
+ S3_PATH: 's3://coco/'
338
+ SEQ_PER_SAMPLE: 1
339
+ CACHE_MODE: True
340
+ CIRCULAR_CACHE_MODE: False
341
+ ZIP_MODE: False
342
+ CACHE_ORIGIN_IMAGE: False
343
+ RANDOM_CAPTION: False
344
+ AS_NUMPY_AS_POSSIBLE: False
345
+ SAMPLING_WEIGHT: 0.1144
346
+ TRANSFORM: 'clip_transforms'
347
+ RANDOM_MASK: True
348
+ MODEL:
349
+ MAX_SEQ_LEN: 50
350
+ EVAL_MAX_SEQ_LEN: 21
351
+ TEMP_NAME: logit_scale_caption
352
+ LOSSES:
353
+ NAMES: ['CrossEntropy', 'Accuracy']
354
+ LOSS_WEIGHT: 0.5
355
+ REDUCTION: 'mean'
356
+ DECODE_STRATEGY:
357
+ NAME: 'CaptionBeamSearcherV3'
358
+ BEAM_SIZE: 2
359
+ # LEN_PENALTY: 1.0
360
+ INFERENCE:
361
+ NAME: 'COCOEvaler'
362
+ VOCAB: 'CLIP'
363
+ ID_KEY: 'image_id'
364
+ VALUE: 'caption'
365
+ VAL_ANNFILE: 'open_source_dataset/mscoco_dataset/new_annotations/captions_val5k.json'
366
+ TEST_ANNFILE: 'open_source_dataset/mscoco_dataset/new_annotations/captions_test5k.json'
367
+ GENERATION_MODE: True
368
+
369
+ -
370
+ NAME: sbu_caption
371
+ DATASETS:
372
+ TRAIN: 'ImageTextPairDataset'
373
+ TASK_TYPE: 'image_caption'
374
+ DATASET_NAME: 'SBU'
375
+ TARGET_SET: ['Vocab_Word']
376
+ DATALOADER:
377
+ TRAIN_BATCH_SIZE: 200
378
+ TEST_BATCH_SIZE: 32
379
+ NUM_WORKERS: 1
380
+ S3_ANNO_FOLDER: 's3://SBU/annotations'
381
+ ANNO_FOLDER: 'open_source_dataset/sbucaption/annotations'
382
+ ANNO_FILENAME: 'subcaption.json'
383
+ FEATS_FOLDER: 'open_source_dataset/sbucaption/'
384
+ S3_PATH: 's3://SBU/images'
385
+ SEQ_PER_SAMPLE: 1
386
+ SAMPLER: NodeDistributed
387
+ CACHE_MODE: True
388
+ CIRCULAR_CACHE_MODE: False
389
+ ZIP_MODE: False
390
+ CACHE_ORIGIN_IMAGE: False
391
+ RANDOM_CAPTION: False
392
+ AS_NUMPY_AS_POSSIBLE: False
393
+ SAMPLING_WEIGHT: 0.1383
394
+ TRANSFORM: 'clip_transforms'
395
+ MODEL:
396
+ MAX_SEQ_LEN: 50
397
+ TEMP_NAME: logit_scale_caption
398
+ LOSSES:
399
+ NAMES: ['CrossEntropy', 'Accuracy']
400
+ LOSS_WEIGHT: 0.5
401
+ REDUCTION: 'mean'
402
+ INFERENCE:
403
+ VOCAB: 'CLIP'
404
+ GENERATION_MODE: False
405
+
406
+ -
407
+ NAME: yfcc_retrieve
408
+ DATASETS:
409
+ TRAIN: 'ImageTextPairDataset'
410
+ TASK_TYPE: 'image_retrieval'
411
+ DATASET_NAME: 'YFCC'
412
+ DATALOADER:
413
+ TRAIN_BATCH_SIZE: 320
414
+ TEST_BATCH_SIZE: 32
415
+ NUM_WORKERS: 2
416
+ S3_ANNO_FOLDER: 'cluster2:s3://yfcc'
417
+ ANNO_FOLDER: 'open_source_dataset/yfcc'
418
+ ANNO_FILENAME: 'yfcc100m_subset_available_untokenized.json'
419
+ FEATS_FOLDER: 'open_source_dataset/yfcc/'
420
+ S3_PATH: 'cluster2:s3://yfcc/'
421
+ SAMPLER: NodeDistributed
422
+ CACHE_MODE: True
423
+ CIRCULAR_CACHE_MODE: False
424
+ ZIP_MODE: False
425
+ CACHE_ORIGIN_IMAGE: False
426
+ RANDOM_CAPTION: False
427
+ AS_NUMPY_AS_POSSIBLE: False
428
+ SAMPLING_WEIGHT: 0.5840
429
+ TRANSFORM: 'clip_transforms'
430
+ MODEL:
431
+ MAX_SEQ_LEN: 50
432
+ TEMP_NAME: logit_scale_retrieve
433
+ LOSSES:
434
+ NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
435
+ LABELSMOOTHING: 0.1
436
+ LOSS_WEIGHT: 0.25
437
+ REDUCTION: 'mean'
438
+ INFERENCE:
439
+ VOCAB: 'CLIP'
440
+ GENERATION_MODE: False
441
+
442
+ -
443
+ NAME: cc12m_retrieve
444
+ DATASETS:
445
+ TRAIN: 'ImageTextPairDataset'
446
+ TASK_TYPE: 'image_retrieval'
447
+ DATASET_NAME: 'CC12M'
448
+ DATALOADER:
449
+ TRAIN_BATCH_SIZE: 320
450
+ TEST_BATCH_SIZE: 32
451
+ NUM_WORKERS: 2
452
+ S3_ANNO_FOLDER: 's3://cc12m/'
453
+ ANNO_FOLDER: 'open_source_dataset/c12m/'
454
+ ANNO_FILENAME: 'train_available.json'
455
+ FEATS_FOLDER: 'open_source_dataset/c12m/'
456
+ S3_PATH: 's3://cc12m/'
457
+ SAMPLER: NodeDistributed
458
+ CACHE_MODE: True
459
+ CIRCULAR_CACHE_MODE: False
460
+ ZIP_MODE: False
461
+ CACHE_ORIGIN_IMAGE: False
462
+ RANDOM_CAPTION: False
463
+ AS_NUMPY_AS_POSSIBLE: False
464
+ SAMPLING_WEIGHT: 0.5057
465
+ TRANSFORM: 'clip_transforms'
466
+ MODEL:
467
+ MAX_SEQ_LEN: 50
468
+ TEMP_NAME: logit_scale_retrieve
469
+ LOSSES:
470
+ NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
471
+ LABELSMOOTHING: 0.1
472
+ LOSS_WEIGHT: 0.25
473
+ REDUCTION: 'mean'
474
+ INFERENCE:
475
+ VOCAB: 'CLIP'
476
+ GENERATION_MODE: False
477
+
478
+ -
479
+ NAME: cc3m_retrieve
480
+ DATASETS:
481
+ TRAIN: 'ImageTextPairDataset'
482
+ TASK_TYPE: 'image_retrieval'
483
+ DATASET_NAME: 'CC3M'
484
+ DATALOADER:
485
+ TRAIN_BATCH_SIZE: 320
486
+ TEST_BATCH_SIZE: 32
487
+ NUM_WORKERS: 2
488
+ S3_ANNO_FOLDER: 's3://cc3m/'
489
+ ANNO_FOLDER: 'open_source_dataset/cc3m/'
490
+ ANNO_FILENAME: 'train_spacy.json'
491
+ FEATS_FOLDER: 'open_source_dataset/cc3m/'
492
+ S3_PATH: 's3://cc3m/'
493
+ SAMPLER: NodeDistributed
494
+ CACHE_MODE: True
495
+ CIRCULAR_CACHE_MODE: False
496
+ ZIP_MODE: False
497
+ CACHE_ORIGIN_IMAGE: False
498
+ RANDOM_CAPTION: False
499
+ AS_NUMPY_AS_POSSIBLE: False
500
+ SAMPLING_WEIGHT: 0.26295
501
+ TRANSFORM: 'clip_transforms'
502
+ MODEL:
503
+ MAX_SEQ_LEN: 50
504
+ TEMP_NAME: logit_scale_retrieve
505
+ LOSSES:
506
+ NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
507
+ LABELSMOOTHING: 0.1
508
+ LOSS_WEIGHT: 0.25
509
+ REDUCTION: 'mean'
510
+ INFERENCE:
511
+ VOCAB: 'CLIP'
512
+ GENERATION_MODE: False
513
+
514
+ -
515
+ NAME: vg_retrieve
516
+ DATASETS:
517
+ TRAIN: 'ImageTextPairDataset'
518
+ TASK_TYPE: 'image_retrieval'
519
+ DATASET_NAME: 'VG'
520
+ DATALOADER:
521
+ TRAIN_BATCH_SIZE: 320
522
+ TEST_BATCH_SIZE: 32
523
+ NUM_WORKERS: 2
524
+ FEATS_FOLDER: 'open_source_dataset/visual_genome/images'
525
+ ANNO_FOLDER: 'open_source_dataset/visual_genome/annotations'
526
+ S3_PATH: 's3://visual_genome/images'
527
+ ANNO_FILENAME: 'vg_captions_128filter.json'
528
+ SEQ_PER_SAMPLE: 1
529
+ CACHE_MODE: True
530
+ CIRCULAR_CACHE_MODE: False
531
+ ZIP_MODE: False
532
+ CACHE_ORIGIN_IMAGE: False
533
+ RANDOM_CAPTION: False
534
+ AS_NUMPY_AS_POSSIBLE: False
535
+ SAMPLING_WEIGHT: 0.1766
536
+ TRANSFORM: 'clip_transforms'
537
+ MODEL:
538
+ MAX_SEQ_LEN: 30
539
+ TEMP_NAME: logit_scale_retrieve
540
+ LOSSES:
541
+ NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
542
+ LABELSMOOTHING: 0.1
543
+ LOSS_WEIGHT: 0.25
544
+ REDUCTION: 'mean'
545
+ INFERENCE:
546
+ VOCAB: 'CLIP'
547
+ GENERATION_MODE: False
548
+
549
+ -
550
+ NAME: mscoco_retrieve
551
+ DATASETS:
552
+ TRAIN: 'ImageTextPairDataset'
553
+ # TEST: 'ImageTextPairDataset'
554
+ TASK_TYPE: 'image_retrieval'
555
+ DATASET_NAME: 'MSCOCO'
556
+ DATALOADER:
557
+ TRAIN_BATCH_SIZE: 320
558
+ TEST_BATCH_SIZE: 32
559
+ NUM_WORKERS: 1
560
+ FEATS_FOLDER: 'open_source_dataset/mscoco_dataset/coco_origin'
561
+ ANNO_FOLDER: 'open_source_dataset/mscoco_dataset/new_annotations'
562
+ S3_PATH: 's3://coco/'
563
+ SEQ_PER_SAMPLE: 1
564
+ CACHE_MODE: True
565
+ CIRCULAR_CACHE_MODE: False
566
+ ZIP_MODE: False
567
+ CACHE_ORIGIN_IMAGE: False
568
+ RANDOM_CAPTION: False
569
+ AS_NUMPY_AS_POSSIBLE: False
570
+ SAMPLING_WEIGHT: 0.1144
571
+ TRANSFORM: 'clip_transforms'
572
+ MODEL:
573
+ MAX_SEQ_LEN: 50
574
+ TEMP_NAME: logit_scale_retrieve
575
+ LOSSES:
576
+ NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
577
+ LABELSMOOTHING: 0.1
578
+ LOSS_WEIGHT: 0.25
579
+ REDUCTION: 'mean'
580
+ INFERENCE:
581
+ VOCAB: 'CLIP'
582
+ ID_KEY: 'image_id'
583
+ VALUE: 'caption'
584
+ NAME: 'RetrievalEvaler'
585
+ VAL_ANNFILE: 'open_source_dataset/flickr30k/all_data_final_val_set0_2014.jsonline'
586
+ TEST_ANNFILE: 'open_source_dataset/flickr30k/all_data_final_test_set0_2014.jsonline'
587
+ GENERATION_MODE: False
588
+
589
+ -
590
+ NAME: sbu_retrieve
591
+ DATASETS:
592
+ TRAIN: 'ImageTextPairDataset'
593
+ TASK_TYPE: 'image_retrieval'
594
+ DATASET_NAME: 'SBU'
595
+ DATALOADER:
596
+ TRAIN_BATCH_SIZE: 320
597
+ TEST_BATCH_SIZE: 32
598
+ NUM_WORKERS: 1
599
+ S3_ANNO_FOLDER: 's3://SBU/annotations'
600
+ ANNO_FOLDER: 'open_source_dataset/sbucaption/annotations'
601
+ ANNO_FILENAME: 'subcaption.json'
602
+ FEATS_FOLDER: 'open_source_dataset/sbucaption/'
603
+ S3_PATH: 's3://SBU/images'
604
+ SAMPLER: NodeDistributed
605
+ CACHE_MODE: True
606
+ CIRCULAR_CACHE_MODE: False
607
+ ZIP_MODE: False
608
+ CACHE_ORIGIN_IMAGE: False
609
+ RANDOM_CAPTION: False
610
+ AS_NUMPY_AS_POSSIBLE: False
611
+ SAMPLING_WEIGHT: 0.1383
612
+ TRANSFORM: 'clip_transforms'
613
+ MODEL:
614
+ MAX_SEQ_LEN: 50
615
+ TEMP_NAME: logit_scale_retrieve
616
+ LOSSES:
617
+ NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
618
+ LABELSMOOTHING: 0.1
619
+ LOSS_WEIGHT: 0.25
620
+ REDUCTION: 'mean'
621
+ INFERENCE:
622
+ VOCAB: 'CLIP'
623
+ GENERATION_MODE: False
624
+
625
+
626
+ ENGINE:
627
+ NAME: 'UnifiedTrainer'
628
+
629
+ MODEL:
630
+ META_ARCHITECTURE: 'MultiTaskTransformerEncoder'
631
+ ENCODER: 'UnifiedBertEncoder'
632
+
633
+
634
+ SHARE_LAYERNORM: True
635
+ BERT:
636
+ NORMALIZE_DECISION: "BERTPre"
637
+ DROP_PATH_PROB: 0.1
638
+ DROP_PATH_PROB_FIXED: True
639
+
640
+ UNIFY_QKV: True
641
+
642
+ MODEL_EMA: False
643
+ MODEL_EMA_DECAY: 0.9999
644
+
645
+ MAEParamsInit: True
646
+ POSEMBEDFIX: True
647
+
648
+
649
+ IMG_INPUT_SIZE: 224
650
+ PATCH_SIZE: 16
651
+ POSEMBED_SCALE: !!python/object/apply:eval ["160/224"]
652
+ CHECKPOINT_FILETER: False
653
+
654
+ LAYER_SCALE: True
655
+ LAYER_SCALE_INIT: 1e-3
656
+ OLD_CHECKPONT: True
657
+
658
+
659
+ DATALOADER:
660
+ USE_WEIGHTED_SAMPLER: True
661
+ UNIFIED_DATASET: True
662
+ NUM_WORKERS: 32
663
+
664
+ PADDING_TO_MAX: False # set True for debugging, or for token-level MoE with distributed experts
665
+
666
+
667
+
668
+ ####################################### Optimizer #######################################
669
+ SOLVER:
670
+ NAME: 'Adam'
671
+ TORCH_OPTIMIZER: True
672
+ PARAMS_SEPERATE: True
673
+ # PARAMS_GROUP: True
674
+ # EPOCH: 1
675
+ MAX_ITER: 45000
676
+ CHECKPOINT_PERIOD: 5000
677
+ EVAL_PERIOD: 10000000
678
+ BASE_LR: 0.00002
679
+ BIAS_LR_FACTOR: 1.0
680
+ WEIGHT_DECAY: 0.05
681
+ WEIGHT_DECAY_NORM: 0.0
682
+ WEIGHT_DECAY_BIAS: 0.0
683
+ WEIGHT_DECAY_EMBEDDING: 0.0
684
+ MOMENTUM: 0.9
685
+ DAMPENING: 0.0
686
+ NESTEROV: 0.0
687
+ BETAS: [0.9, 0.95]
688
+ EPS: 1e-6
689
+ GRAD_CLIP: 0.1
690
+ GRAD_CLIP_TYPE: 'norm'
691
+ ACCUM_ITER: 0
692
+ AMP_FP16: True
693
+ APEX_FP16: False # dangerous: keep disabled and use AMP_FP16 above instead
694
+
695
+ WRITE_PERIOD: 50
696
+ MIN_LOSS_SCLE: 2048.0 # floor for the dynamic fp16 loss scale
697
+ # BF16: False # True
698
+ # ZEROSTAGE: 2
699
+
700
+ LOSS_SCALE_WINDOW: 200
701
+
702
+
703
+ ####################################### lr scheduler #######################################
704
+ LR_SCHEDULER:
705
+ NAME: 'WarmupCosine'
706
+ WARMUP: 5000
707
+ MIN_LR: 0.000001
708
+
709
+ ####################################### evaluation #######################################
710
+ INFERENCE:
711
+
712
+ VOCAB: 'CLIP'
713
+ ITER_BASED: True
714
+
715
+
716
+ find_unused_parameters: true
717
+
718
+ # ENCODERS:
719
+ # -
720
+ # NAME: VisualEncoder
721
+ # TYPE: VisualEncoder
722
+ # DROP_PATH_PROB: 0.0
723
+ # HIDDEN_SIZE: 192
724
+ # HIDDEN_DROPOUT_PROB: 0.
725
+ # HIDDEN_ACT: "gelu"
726
+ # NUM_ATTENTION_HEADS: 3
727
+ # INTERMEDIATE_SIZE: 768
728
+ # INTERMEDIATE_DROP: 0.
729
+ # FFN_DROPOUT_PROB: 0.
730
+ # ATTENTION_PROBS_DROPOUT_PROB: 0.
731
+ # NUM_HIDDEN_LAYERS: 6
732
+ # NUM_GENERATION_LAYERS: 0
733
+ # DROP_PATH_PROB_FIXED: True
734
+
735
+ # -
736
+ # NAME: TextEncoder
737
+ # TYPE: TextEncoder
738
+ # DROP_PATH_PROB: 0.0
739
+ # HIDDEN_SIZE: 192
740
+ # HIDDEN_DROPOUT_PROB: 0.
741
+ # HIDDEN_ACT: "gelu"
742
+ # NUM_ATTENTION_HEADS: 3
743
+ # INTERMEDIATE_SIZE: 768
744
+ # INTERMEDIATE_DROP: 0.
745
+ # FFN_DROPOUT_PROB: 0.
746
+ # ATTENTION_PROBS_DROPOUT_PROB: 0.
747
+ # NUM_HIDDEN_LAYERS: 6
748
+ # NUM_GENERATION_LAYERS: 0
749
+ # DROP_PATH_PROB_FIXED: True
750
+
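
The SOLVER and LR_SCHEDULER blocks above combine linear warmup with cosine decay. As a minimal sketch of what a 'WarmupCosine' schedule with WARMUP: 5000, BASE_LR: 0.00002, MIN_LR: 0.000001 and MAX_ITER: 45000 plausibly computes (the function name and exact decay formula are assumptions; the trainer's own scheduler may differ in details):

import math

def warmup_cosine_lr(step, base_lr=2e-5, min_lr=1e-6, warmup=5000, max_iter=45000):
    # Linear warmup from 0 to base_lr, then cosine decay down to min_lr.
    if step < warmup:
        return base_lr * step / max(1, warmup)
    progress = (step - warmup) / max(1, max_iter - warmup)
    return min_lr + 0.5 * (base_lr - min_lr) * (1.0 + math.cos(math.pi * progress))

# warmup_cosine_lr(5000) -> 2e-5 (peak); warmup_cosine_lr(45000) -> 1e-6 (floor)
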
configs/BERT_L12_H768_experiments/16tasks_training_basemoe_stage1_56gpu.yaml ADDED
@@ -0,0 +1,733 @@
1
+ _BASE_: "base_model_bert_l12_h768.yaml"
2
+
3
+ SHARED_TARGETS:
4
+
5
+ -
6
+ NAME: 'ImageNet22k'
7
+ SHARED_TARGETS_CFG:
8
+ FILE_PATH: 'open_source_dataset/imagenet_22k_class_name_CLIP_with_endoftext.pkl'
9
+ DISTRIBUTED: True
10
+
11
+ -
12
+ NAME: 'Vocab_Word'
13
+ SHARED_TARGETS_CFG:
14
+ FILE_PATH: 'open_source_dataset/vocabulary_CLIP_with_endoftext.pkl'
15
+ DISTRIBUTED: True
16
+
17
+ -
18
+ NAME: 'MomentsInTime'
19
+ SHARED_TARGETS_CFG:
20
+ FILE_PATH: 'open_source_dataset/MiT_class_name_CLIP_with_endoftext.pkl'
21
+ DISTRIBUTED: False
22
+
23
+ -
24
+ NAME: 'Kinetics700'
25
+ SHARED_TARGETS_CFG:
26
+ FILE_PATH: 'open_source_dataset/k700_class_name_CLIP_with_endoftext.pkl'
27
+ DISTRIBUTED: False
28
+
29
+ TASKS:
30
+
31
+ -
32
+ NAME: imagenet22k
33
+ DATASETS:
34
+ TRAIN: 'ImageNet22KDataset'
35
+ TASK_TYPE: 'image_classification'
36
+ DATASET_NAME: 'ImageNet22k'
37
+ TARGET_SET: ['ImageNet22k']
38
+
39
+ DATALOADER:
40
+ TRAIN_BATCH_SIZE: 720
41
+ # TEST_BATCH_SIZE: 2
42
+ NUM_WORKERS: 2
43
+ FEATS_FOLDER: 'open_source_dataset/imagenet22k'
44
+ S3_PATH: 'cluster2:s3://imagenet22k'
45
+ ANNO_FOLDER: 'open_source_dataset/'
46
+ SAMPLING_WEIGHT: 2.486
47
+ MIXUP: 0.8
48
+ CUTMIX: 1.0
49
+ MIXUP_PROB: 1.0
50
+ MIXUP_SWITCH_PROB: 0.5
51
+ MIXUP_MODE: 'batch'
52
+ MIXUP_LABEL_SMOOTHING: 0.1
53
+ MODEL:
54
+ MAX_SEQ_LEN: -1
55
+ LABELS_NUM: 21842
56
+ TEMP_NAME: logit_scale_img_cls
57
+ LOSSES:
58
+ NAMES: ['SoftTargetCrossEntropy', 'Accuracy']
59
+ LOSS_WEIGHT: 1.0
60
+ REDUCTION: 'mean'
61
+
62
+ -
63
+ NAME: K700_retrieve
64
+ DATASETS:
65
+ TRAIN: 'VideoDataSet'
66
+ TASK_TYPE: 'video_classification'
67
+ DATASET_NAME: 'K700'
68
+ TARGET_SET: ['Kinetics700']
69
+ DATALOADER:
70
+ TRAIN_BATCH_SIZE: 64
71
+ TEST_BATCH_SIZE: 24
72
+ NUM_WORKERS: 2
73
+ FEATS_FOLDER: 'open_source_dataset/K700'
74
+ ANNO_FOLDER: 'open_source_dataset/K700'
75
+ S3_PATH: 's3://K700/'
76
+ FRAMES_PER_CLIP: 4
77
+ STRIDE: 32
78
+ FILE_EXTENSION: ''
79
+ ANNO_FILE: 'annotation.json'
80
+ TIMESFORMER_AUG: True
81
+ SAMPLING_WEIGHT: 0.76
82
+
83
+ MODEL:
84
+ MAX_SEQ_LEN: -1
85
+ TEMP_NAME: logit_scale_video_cls
86
+ LOSSES:
87
+ NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
88
+ LABELSMOOTHING: 0.1
89
+ LOSS_WEIGHT: 0.1
90
+ INFERENCE:
91
+ VOCAB: 'CLIP'
92
+ GENERATION_MODE: False
93
+
94
+ -
95
+ NAME: MomentsInTime
96
+ DATASETS:
97
+ TRAIN: 'VideoDataSet'
98
+ TASK_TYPE: 'video_classification'
99
+ DATASET_NAME: 'MiT'
100
+ TARGET_SET: ['MomentsInTime']
101
+ DATALOADER:
102
+ TRAIN_BATCH_SIZE: 112
103
+ TEST_BATCH_SIZE: 8
104
+ NUM_WORKERS: 2
105
+ FEATS_FOLDER: 'open_source_dataset/MomentsInTime'
106
+ ANNO_FOLDER: 'open_source_dataset/MomentsInTime'
107
+ S3_PATH: 's3://MomentsInTime/'
108
+ FRAMES_PER_CLIP: 3
109
+ STRIDE: 32
110
+ FILE_EXTENSION: ''
111
+ ANNO_FILE: 'annotation.json'
112
+ TIMESFORMER_AUG: True
113
+ SAMPLING_WEIGHT: 0.44
114
+
115
+ MODEL:
116
+ MAX_SEQ_LEN: -1
117
+ TEMP_NAME: logit_scale_video_cls
118
+ LOSSES:
119
+ NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
120
+ LABELSMOOTHING: 0.1
121
+ LOSS_WEIGHT: 0.1
122
+ INFERENCE:
123
+ NAME: 'MiTEvaler'
124
+ ID_KEY: 'video_name'
125
+ VALUE: 'label'
126
+ VAL_ANNFILE: 'open_source_dataset/MomentsInTime/annotation.json'
127
+ TEST_ANNFILE: ''
128
+ GENERATION_MODE: False
129
+ NUM_VIEWS: 1
130
+
131
+ -
132
+ NAME: bookswiki_pretrain
133
+ DATASETS:
134
+ TRAIN: 'GeneralCorpusDataset'
135
+ TASK_TYPE: 'text_mlm'
136
+ DATASET_NAME: 'BooksWiki'
137
+ TARGET_SET: ['Vocab_Word']
138
+ VERSION: 'v2'
139
+ DATALOADER:
140
+ TRAIN_BATCH_SIZE: 512
141
+ TEST_BATCH_SIZE: 32
142
+ NUM_WORKERS: 2
143
+ ANNO_FOLDER: 'open_source_dataset/text_corpus' # 'open_source_dataset/bert_pretrain_data/bookswiki'
144
+ # ANNO_FOLDER: 'open_source_dataset/bert_pretrain_data/bookswiki'
145
+ # SEQ_PER_SAMPLE: 1 # duplicate key; the SEQ_PER_SAMPLE: 128 below takes effect
146
+ SAMPLER: NodeDistributed
147
+ CACHE_MODE: True
148
+ SEQ_PER_SAMPLE: 128
149
+ MIN_SEQ_PER_SAMPLE: 128
150
+ APPEND_EOS: True
151
+ ONE_STREAM: False
152
+ SAMPLING_WEIGHT: 2.75
153
+ RANDOM_MASK: True
154
+ MODEL:
155
+ MAX_SEQ_LEN: 128
156
+ TEMP_NAME: logit_scale_text_mlm
157
+ LOSSES:
158
+ NAMES: ['CrossEntropy', 'Accuracy']
159
+ LOSS_WEIGHT: 0.5
160
+ REDUCTION: 'mean'
161
+ INFERENCE:
162
+ VOCAB: 'CLIP'
163
+ GENERATION_MODE: False
164
+
165
+
166
+ -
167
+ NAME: yfcc_caption
168
+ DATASETS:
169
+ TRAIN: 'ImageTextPairDataset'
170
+ TASK_TYPE: 'image_caption'
171
+ DATASET_NAME: 'YFCC'
172
+ TARGET_SET: ['Vocab_Word']
173
+ DATALOADER:
174
+ TRAIN_BATCH_SIZE: 300
175
+ TEST_BATCH_SIZE: 32
176
+ NUM_WORKERS: 2
177
+ S3_ANNO_FOLDER: 'cluster2:s3://yfcc'
178
+ ANNO_FOLDER: 'open_source_dataset/yfcc'
179
+ ANNO_FILENAME: 'yfcc100m_subset_available_untokenized.json'
180
+ FEATS_FOLDER: 'open_source_dataset/yfcc/'
181
+ S3_PATH: 'cluster2:s3://yfcc/'
182
+ SEQ_PER_SAMPLE: 1
183
+ SAMPLER: NodeDistributed
184
+ CACHE_MODE: True
185
+ CIRCULAR_CACHE_MODE: False
186
+ ZIP_MODE: False
187
+ CACHE_ORIGIN_IMAGE: False
188
+ RANDOM_CAPTION: True
189
+ AS_NUMPY_AS_POSSIBLE: False
190
+ SAMPLING_WEIGHT: 0.5840
191
+ TRANSFORM: 'clip_transforms'
192
+ MODEL:
193
+ MAX_SEQ_LEN: 50
194
+ TEMP_NAME: logit_scale_caption
195
+ LOSSES:
196
+ NAMES: ['CrossEntropy', 'Accuracy']
197
+ LOSS_WEIGHT: 1.0
198
+ REDUCTION: 'mean'
199
+ INFERENCE:
200
+ VOCAB: 'CLIP'
201
+ GENERATION_MODE: False
202
+
203
+ -
204
+ NAME: cc12m_caption
205
+ DATASETS:
206
+ TRAIN: 'ImageTextPairDataset'
207
+ TASK_TYPE: 'image_caption'
208
+ DATASET_NAME: 'CC12M'
209
+ TARGET_SET: ['Vocab_Word']
210
+ DATALOADER:
211
+ TRAIN_BATCH_SIZE: 300
212
+ TEST_BATCH_SIZE: 32
213
+ NUM_WORKERS: 2
214
+ S3_ANNO_FOLDER: 's3://cc12m/'
215
+ ANNO_FOLDER: 'open_source_dataset/c12m/'
216
+ ANNO_FILENAME: 'train_available.json'
217
+ FEATS_FOLDER: 'open_source_dataset/c12m/'
218
+ S3_PATH: 's3://cc12m/'
219
+ SEQ_PER_SAMPLE: 1
220
+ SAMPLER: NodeDistributed
221
+ CACHE_MODE: True
222
+ CIRCULAR_CACHE_MODE: False
223
+ ZIP_MODE: False
224
+ CACHE_ORIGIN_IMAGE: False
225
+ RANDOM_CAPTION: False
226
+ AS_NUMPY_AS_POSSIBLE: False
227
+ SAMPLING_WEIGHT: 0.5057
228
+ TRANSFORM: 'clip_transforms'
229
+ MODEL:
230
+ MAX_SEQ_LEN: 50
231
+ TEMP_NAME: logit_scale_caption
232
+ LOSSES:
233
+ NAMES: ['CrossEntropy', 'Accuracy']
234
+ LOSS_WEIGHT: 1.0
235
+ REDUCTION: 'mean'
236
+ INFERENCE:
237
+ VOCAB: 'CLIP'
238
+ GENERATION_MODE: False
239
+
240
+ -
241
+ NAME: cc3m_caption
242
+ DATASETS:
243
+ TRAIN: 'ImageTextPairDataset'
244
+ TASK_TYPE: 'image_caption'
245
+ DATASET_NAME: 'CC3M'
246
+ TARGET_SET: ['Vocab_Word']
247
+ DATALOADER:
248
+ TRAIN_BATCH_SIZE: 300
249
+ TEST_BATCH_SIZE: 32
250
+ NUM_WORKERS: 2
251
+ S3_ANNO_FOLDER: 's3://cc3m/'
252
+ ANNO_FOLDER: 'open_source_dataset/cc3m/'
253
+ ANNO_FILENAME: 'train_spacy.json'
254
+ FEATS_FOLDER: 'open_source_dataset/cc3m/'
255
+ S3_PATH: 's3://cc3m/'
256
+ SEQ_PER_SAMPLE: 1
257
+ SAMPLER: NodeDistributed
258
+ CACHE_MODE: True
259
+ CIRCULAR_CACHE_MODE: False
260
+ ZIP_MODE: False
261
+ CACHE_ORIGIN_IMAGE: False
262
+ RANDOM_CAPTION: False
263
+ AS_NUMPY_AS_POSSIBLE: False
264
+ SAMPLING_WEIGHT: 0.26295
265
+ TRANSFORM: 'clip_transforms'
266
+ MODEL:
267
+ MAX_SEQ_LEN: 50
268
+ TEMP_NAME: logit_scale_caption
269
+ LOSSES:
270
+ NAMES: ['CrossEntropy', 'Accuracy']
271
+ LOSS_WEIGHT: 1.0
272
+ REDUCTION: 'mean'
273
+ INFERENCE:
274
+ VOCAB: 'CLIP'
275
+ GENERATION_MODE: False
276
+
277
+ -
278
+ NAME: vg_caption
279
+ DATASETS:
280
+ TRAIN: 'ImageTextPairDataset'
281
+ TASK_TYPE: 'image_caption'
282
+ DATASET_NAME: 'VG'
283
+ TARGET_SET: ['Vocab_Word']
284
+ DATALOADER:
285
+ TRAIN_BATCH_SIZE: 300
286
+ TEST_BATCH_SIZE: 32
287
+ NUM_WORKERS: 2
288
+ FEATS_FOLDER: 'open_source_dataset/visual_genome/images'
289
+ ANNO_FOLDER: 'open_source_dataset/visual_genome/annotations'
290
+ S3_PATH: 's3://visual_genome/images'
291
+ ANNO_FILENAME: 'vg_captions_128filter.json'
292
+ SEQ_PER_SAMPLE: 1
293
+ CACHE_MODE: True
294
+ CIRCULAR_CACHE_MODE: False
295
+ ZIP_MODE: False
296
+ CACHE_ORIGIN_IMAGE: False
297
+ RANDOM_CAPTION: False
298
+ AS_NUMPY_AS_POSSIBLE: False
299
+ SAMPLING_WEIGHT: 0.1766
300
+ TRANSFORM: 'clip_transforms'
301
+ MODEL:
302
+ MAX_SEQ_LEN: 30
303
+ TEMP_NAME: logit_scale_caption
304
+ LOSSES:
305
+ NAMES: ['CrossEntropy', 'Accuracy']
306
+ LOSS_WEIGHT: 1.0
307
+ REDUCTION: 'mean'
308
+ INFERENCE:
309
+ VOCAB: 'CLIP'
310
+ GENERATION_MODE: True
311
+
312
+
313
+ -
314
+ NAME: mscoco_caption
315
+ DATASETS:
316
+ TRAIN: 'ImageTextPairDataset'
317
+ # VAL: 'ImageTextPairDataset'
318
+ # TEST: 'ImageTextPairDataset'
319
+ TASK_TYPE: 'image_caption'
320
+ DATASET_NAME: 'MSCOCO'
321
+ TARGET_SET: ['Vocab_Word']
322
+ DATALOADER:
323
+ TRAIN_BATCH_SIZE: 300
324
+ TEST_BATCH_SIZE: 32
325
+ NUM_WORKERS: 1
326
+ FEATS_FOLDER: 'open_source_dataset/mscoco_dataset/coco_origin'
327
+ ANNO_FOLDER: 'open_source_dataset/mscoco_dataset/new_annotations'
328
+ S3_PATH: 's3://coco/'
329
+ SEQ_PER_SAMPLE: 1
330
+ CACHE_MODE: True
331
+ CIRCULAR_CACHE_MODE: False
332
+ ZIP_MODE: False
333
+ CACHE_ORIGIN_IMAGE: False
334
+ RANDOM_CAPTION: False
335
+ AS_NUMPY_AS_POSSIBLE: False
336
+ SAMPLING_WEIGHT: 0.1144
337
+ TRANSFORM: 'clip_transforms'
338
+ RANDOM_MASK: True
339
+ MODEL:
340
+ MAX_SEQ_LEN: 50
341
+ EVAL_MAX_SEQ_LEN: 21
342
+ TEMP_NAME: logit_scale_caption
343
+ LOSSES:
344
+ NAMES: ['CrossEntropy', 'Accuracy']
345
+ LOSS_WEIGHT: 1.0
346
+ REDUCTION: 'mean'
347
+ DECODE_STRATEGY:
348
+ NAME: 'CaptionBeamSearcherV3'
349
+ BEAM_SIZE: 2
350
+ # LEN_PENALTY: 1.0
351
+ INFERENCE:
352
+ NAME: 'COCOEvaler'
353
+ VOCAB: 'CLIP'
354
+ ID_KEY: 'image_id'
355
+ VALUE: 'caption'
356
+ VAL_ANNFILE: 'open_source_dataset/mscoco_dataset/new_annotations/captions_val5k.json'
357
+ TEST_ANNFILE: 'open_source_dataset/mscoco_dataset/new_annotations/captions_test5k.json'
358
+ GENERATION_MODE: True
359
+
360
+ -
361
+ NAME: sbu_caption
362
+ DATASETS:
363
+ TRAIN: 'ImageTextPairDataset'
364
+ TASK_TYPE: 'image_caption'
365
+ DATASET_NAME: 'SBU'
366
+ TARGET_SET: ['Vocab_Word']
367
+ DATALOADER:
368
+ TRAIN_BATCH_SIZE: 300
369
+ TEST_BATCH_SIZE: 32
370
+ NUM_WORKERS: 1
371
+ S3_ANNO_FOLDER: 's3://SBU/annotations'
372
+ ANNO_FOLDER: 'open_source_dataset/sbucaption/annotations'
373
+ ANNO_FILENAME: 'subcaption.json'
374
+ FEATS_FOLDER: 'open_source_dataset/sbucaption/'
375
+ S3_PATH: 's3://SBU/images'
376
+ SEQ_PER_SAMPLE: 1
377
+ SAMPLER: NodeDistributed
378
+ CACHE_MODE: True
379
+ CIRCULAR_CACHE_MODE: False
380
+ ZIP_MODE: False
381
+ CACHE_ORIGIN_IMAGE: False
382
+ RANDOM_CAPTION: False
383
+ AS_NUMPY_AS_POSSIBLE: False
384
+ SAMPLING_WEIGHT: 0.1383
385
+ TRANSFORM: 'clip_transforms'
386
+ MODEL:
387
+ MAX_SEQ_LEN: 50
388
+ TEMP_NAME: logit_scale_caption
389
+ LOSSES:
390
+ NAMES: ['CrossEntropy', 'Accuracy']
391
+ LOSS_WEIGHT: 1.0
392
+ REDUCTION: 'mean'
393
+ INFERENCE:
394
+ VOCAB: 'CLIP'
395
+ GENERATION_MODE: False
396
+
397
+ -
398
+ NAME: yfcc_retrieve
399
+ DATASETS:
400
+ TRAIN: 'ImageTextPairDataset'
401
+ TASK_TYPE: 'image_retrieval'
402
+ DATASET_NAME: 'YFCC'
403
+ DATALOADER:
404
+ TRAIN_BATCH_SIZE: 512
405
+ TEST_BATCH_SIZE: 32
406
+ NUM_WORKERS: 2
407
+ S3_ANNO_FOLDER: 'cluster2:s3://yfcc'
408
+ ANNO_FOLDER: 'open_source_dataset/yfcc'
409
+ ANNO_FILENAME: 'yfcc100m_subset_available_untokenized.json'
410
+ FEATS_FOLDER: 'open_source_dataset/yfcc/'
411
+ S3_PATH: 'cluster2:s3://yfcc/'
412
+ SAMPLER: NodeDistributed
413
+ CACHE_MODE: True
414
+ CIRCULAR_CACHE_MODE: False
415
+ ZIP_MODE: False
416
+ CACHE_ORIGIN_IMAGE: False
417
+ RANDOM_CAPTION: True
418
+ AS_NUMPY_AS_POSSIBLE: False
419
+ SAMPLING_WEIGHT: 0.5840
420
+ TRANSFORM: 'clip_transforms'
421
+ MODEL:
422
+ MAX_SEQ_LEN: 50
423
+ TEMP_NAME: logit_scale_retrieve
424
+ LOSSES:
425
+ NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
426
+ LABELSMOOTHING: 0.1
427
+ LOSS_WEIGHT: 0.5
428
+ REDUCTION: 'mean'
429
+ INFERENCE:
430
+ VOCAB: 'CLIP'
431
+ GENERATION_MODE: False
432
+
433
+ -
434
+ NAME: cc12m_retrieve
435
+ DATASETS:
436
+ TRAIN: 'ImageTextPairDataset'
437
+ TASK_TYPE: 'image_retrieval'
438
+ DATASET_NAME: 'CC12M'
439
+ DATALOADER:
440
+ TRAIN_BATCH_SIZE: 512
441
+ TEST_BATCH_SIZE: 32
442
+ NUM_WORKERS: 2
443
+ S3_ANNO_FOLDER: 's3://cc12m/'
444
+ ANNO_FOLDER: 'open_source_dataset/c12m/'
445
+ ANNO_FILENAME: 'train_available.json'
446
+ FEATS_FOLDER: 'open_source_dataset/c12m/'
447
+ S3_PATH: 's3://cc12m/'
448
+ SAMPLER: NodeDistributed
449
+ CACHE_MODE: True
450
+ CIRCULAR_CACHE_MODE: False
451
+ ZIP_MODE: False
452
+ CACHE_ORIGIN_IMAGE: False
453
+ RANDOM_CAPTION: False
454
+ AS_NUMPY_AS_POSSIBLE: False
455
+ SAMPLING_WEIGHT: 0.5057
456
+ TRANSFORM: 'clip_transforms'
457
+ MODEL:
458
+ MAX_SEQ_LEN: 50
459
+ TEMP_NAME: logit_scale_retrieve
460
+ LOSSES:
461
+ NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
462
+ LABELSMOOTHING: 0.1
463
+ LOSS_WEIGHT: 0.5
464
+ REDUCTION: 'mean'
465
+ INFERENCE:
466
+ VOCAB: 'CLIP'
467
+ GENERATION_MODE: False
468
+
469
+ -
470
+ NAME: cc3m_retrieve
471
+ DATASETS:
472
+ TRAIN: 'ImageTextPairDataset'
473
+ TASK_TYPE: 'image_retrieval'
474
+ DATASET_NAME: 'CC3M'
475
+ DATALOADER:
476
+ TRAIN_BATCH_SIZE: 512
477
+ TEST_BATCH_SIZE: 32
478
+ NUM_WORKERS: 2
479
+ S3_ANNO_FOLDER: 's3://cc3m/'
480
+ ANNO_FOLDER: 'open_source_dataset/cc3m/'
481
+ ANNO_FILENAME: 'train_spacy.json'
482
+ FEATS_FOLDER: 'open_source_dataset/cc3m/'
483
+ S3_PATH: 's3://cc3m/'
484
+ SAMPLER: NodeDistributed
485
+ CACHE_MODE: True
486
+ CIRCULAR_CACHE_MODE: False
487
+ ZIP_MODE: False
488
+ CACHE_ORIGIN_IMAGE: False
489
+ RANDOM_CAPTION: False
490
+ AS_NUMPY_AS_POSSIBLE: False
491
+ SAMPLING_WEIGHT: 0.26295
492
+ TRANSFORM: 'clip_transforms'
493
+ MODEL:
494
+ MAX_SEQ_LEN: 50
495
+ TEMP_NAME: logit_scale_retrieve
496
+ LOSSES:
497
+ NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
498
+ LABELSMOOTHING: 0.1
499
+ LOSS_WEIGHT: 0.5
500
+ REDUCTION: 'mean'
501
+ INFERENCE:
502
+ VOCAB: 'CLIP'
503
+ GENERATION_MODE: False
504
+
505
+ -
506
+ NAME: vg_retrieve
507
+ DATASETS:
508
+ TRAIN: 'ImageTextPairDataset'
509
+ TASK_TYPE: 'image_retrieval'
510
+ DATASET_NAME: 'VG'
511
+ DATALOADER:
512
+ TRAIN_BATCH_SIZE: 512
513
+ TEST_BATCH_SIZE: 32
514
+ NUM_WORKERS: 2
515
+ FEATS_FOLDER: 'open_source_dataset/visual_genome/images'
516
+ ANNO_FOLDER: 'open_source_dataset/visual_genome/annotations'
517
+ S3_PATH: 's3://visual_genome/images'
518
+ ANNO_FILENAME: 'vg_captions_128filter.json'
519
+ SEQ_PER_SAMPLE: 1
520
+ CACHE_MODE: True
521
+ CIRCULAR_CACHE_MODE: False
522
+ ZIP_MODE: False
523
+ CACHE_ORIGIN_IMAGE: False
524
+ RANDOM_CAPTION: False
525
+ AS_NUMPY_AS_POSSIBLE: False
526
+ SAMPLING_WEIGHT: 0.1766
527
+ TRANSFORM: 'clip_transforms'
528
+ MODEL:
529
+ MAX_SEQ_LEN: 30
530
+ TEMP_NAME: logit_scale_retrieve
531
+ LOSSES:
532
+ NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
533
+ LABELSMOOTHING: 0.1
534
+ LOSS_WEIGHT: 0.5
535
+ REDUCTION: 'mean'
536
+ INFERENCE:
537
+ VOCAB: 'CLIP'
538
+ GENERATION_MODE: False
539
+
540
+ -
541
+ NAME: mscoco_retrieve
542
+ DATASETS:
543
+ TRAIN: 'ImageTextPairDataset'
544
+ # TEST: 'ImageTextPairDataset'
545
+ TASK_TYPE: 'image_retrieval'
546
+ DATASET_NAME: 'MSCOCO'
547
+ DATALOADER:
548
+ TRAIN_BATCH_SIZE: 512
549
+ TEST_BATCH_SIZE: 32
550
+ NUM_WORKERS: 1
551
+ FEATS_FOLDER: 'open_source_dataset/mscoco_dataset/coco_origin'
552
+ ANNO_FOLDER: 'open_source_dataset/mscoco_dataset/new_annotations'
553
+ S3_PATH: 's3://coco/'
554
+ SEQ_PER_SAMPLE: 1
555
+ CACHE_MODE: True
556
+ CIRCULAR_CACHE_MODE: False
557
+ ZIP_MODE: False
558
+ CACHE_ORIGIN_IMAGE: False
559
+ RANDOM_CAPTION: False
560
+ AS_NUMPY_AS_POSSIBLE: False
561
+ SAMPLING_WEIGHT: 0.1144
562
+ TRANSFORM: 'clip_transforms'
563
+ MODEL:
564
+ MAX_SEQ_LEN: 50
565
+ TEMP_NAME: logit_scale_retrieve
566
+ LOSSES:
567
+ NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
568
+ LABELSMOOTHING: 0.1
569
+ LOSS_WEIGHT: 0.5
570
+ REDUCTION: 'mean'
571
+ INFERENCE:
572
+ VOCAB: 'CLIP'
573
+ ID_KEY: 'image_id'
574
+ VALUE: 'caption'
575
+ NAME: 'RetrievalEvaler'
576
+ VAL_ANNFILE: 'open_source_dataset/flickr30k/all_data_final_val_set0_2014.jsonline'
577
+ TEST_ANNFILE: 'open_source_dataset/flickr30k/all_data_final_test_set0_2014.jsonline'
578
+ GENERATION_MODE: False
579
+
580
+ -
581
+ NAME: sbu_retrieve
582
+ DATASETS:
583
+ TRAIN: 'ImageTextPairDataset'
584
+ TASK_TYPE: 'image_retrieval'
585
+ DATASET_NAME: 'SBU'
586
+ DATALOADER:
587
+ TRAIN_BATCH_SIZE: 512
588
+ TEST_BATCH_SIZE: 32
589
+ NUM_WORKERS: 1
590
+ S3_ANNO_FOLDER: 's3://SBU/annotations'
591
+ ANNO_FOLDER: 'open_source_dataset/sbucaption/annotations'
592
+ ANNO_FILENAME: 'subcaption.json'
593
+ FEATS_FOLDER: 'open_source_dataset/sbucaption/'
594
+ S3_PATH: 's3://SBU/images'
595
+ SAMPLER: NodeDistributed
596
+ CACHE_MODE: True
597
+ CIRCULAR_CACHE_MODE: False
598
+ ZIP_MODE: False
599
+ CACHE_ORIGIN_IMAGE: False
600
+ RANDOM_CAPTION: False
601
+ AS_NUMPY_AS_POSSIBLE: False
602
+ SAMPLING_WEIGHT: 0.1383
603
+ TRANSFORM: 'clip_transforms'
604
+ MODEL:
605
+ MAX_SEQ_LEN: 50
606
+ TEMP_NAME: logit_scale_retrieve
607
+ LOSSES:
608
+ NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
609
+ LABELSMOOTHING: 0.1
610
+ LOSS_WEIGHT: 0.5
611
+ REDUCTION: 'mean'
612
+ INFERENCE:
613
+ VOCAB: 'CLIP'
614
+ GENERATION_MODE: False
615
+
616
+
617
+ ENGINE:
618
+ NAME: 'UnifiedTrainer'
619
+
620
+ MODEL:
621
+ META_ARCHITECTURE: 'MultiTaskTransformerEncoder'
622
+ ENCODER: 'UnifiedBertEncoder'
623
+
624
+
625
+ SHARE_LAYERNORM: True
626
+ BERT:
627
+ NORMALIZE_DECISION: "BERTPre"
628
+ DROP_PATH_PROB: 0.1
629
+ DROP_PATH_PROB_FIXED: True
630
+
631
+ UNIFY_QKV: True
632
+
633
+ MODEL_EMA: False
634
+ MODEL_EMA_DECAY: 0.9999
635
+
636
+ MAEParamsInit: True
637
+ POSEMBEDFIX: True
638
+
639
+
640
+ IMG_INPUT_SIZE: 160
641
+ PATCH_SIZE: 16
642
+
643
+ LAYER_SCALE: True
644
+ LAYER_SCALE_INIT: 1e-3
645
+ OLD_CHECKPONT: True
646
+
647
+ LAYER_SCALE_FP32: True
648
+ GATE_FP32: False
649
+ TAG_TRANSFORM_FP32: False
650
+
651
+
652
+ DATALOADER:
653
+ USE_WEIGHTED_SAMPLER: True
654
+ UNIFIED_DATASET: True
655
+ NUM_WORKERS: 32
656
+
657
+ PADDING_TO_MAX: False # set True for debugging, or for token-level MoE with distributed experts
658
+
659
+
660
+
661
+ ####################################### Optimizer #######################################
662
+ SOLVER:
663
+ NAME: 'Adam'
664
+ TORCH_OPTIMIZER: True
665
+ PARAMS_SEPERATE: True
666
+ # PARAMS_GROUP: True
667
+ # EPOCH: 1
668
+ MAX_ITER: 230000
669
+ CHECKPOINT_PERIOD: 10000
670
+ EVAL_PERIOD: 10000000
671
+ BASE_LR: 0.001
672
+ BIAS_LR_FACTOR: 1.0
673
+ WEIGHT_DECAY: 0.2
674
+ WEIGHT_DECAY_NORM: 0.0
675
+ WEIGHT_DECAY_BIAS: 0.0
676
+ WEIGHT_DECAY_EMBEDDING: 0.0
677
+ MOMENTUM: 0.9
678
+ DAMPENING: 0.0
679
+ NESTEROV: 0.0
680
+ BETAS: [0.9, 0.95]
681
+ EPS: 1e-6
682
+ GRAD_CLIP: 0.1
683
+ GRAD_CLIP_TYPE: 'norm'
684
+ ACCUM_ITER: 0
685
+ AMP_FP16: True
686
+ APEX_FP16: False # dangerous: keep disabled and use AMP_FP16 above instead
687
+
688
+ WRITE_PERIOD: 50
689
+ MIN_LOSS_SCLE: 2048.0 # floor for the dynamic fp16 loss scale
690
+ # BF16: False # True
691
+ # ZEROSTAGE: 2
692
+
693
+ LOSS_SCALE_WINDOW: 200
694
+
695
+
696
+ ####################################### lr scheduler #######################################
697
+ LR_SCHEDULER:
698
+ NAME: 'WarmupCosine'
699
+ WARMUP: 10000
700
+ MIN_LR: 0.000001
701
+
702
+ ####################################### evaluation #######################################
703
+ INFERENCE:
704
+
705
+ VOCAB: 'CLIP'
706
+ ITER_BASED: True
707
+
708
+
709
+ find_unused_parameters: true
710
+
711
+ MOE:
712
+ MOE: True
713
+ MOE_TYPE: 'attribute'
714
+ TAG_Transform: True
715
+ ATTRIBUTE_LENGTH: 8
716
+ EP_WORLD_SIZE: 1 # expert-parallel world size; relevant to tag MoE only
717
+ NUM_EXPERTS: 8
718
+ TOP_K: 2
719
+ CAPACITY_FACTOR: 3.0
720
+ EVAL_MIN_CAPACITY: 4.0
721
+ MIN_CAPACITY: 4
722
+ NOISY_GATE_POLICY: 'vmoe'
723
+ MOE_PARAM_GROUP: True
724
+ MOE_EXPERT_TYPE: 'FFN,SA'
725
+ SA_LINEAR_OUT_MOE: True
726
+ MOE_EXPERT_LOCATION: 'odd' # place expert layers on odd-numbered blocks
727
+ # MOE_LAYER_START_IDX: 3
728
+ # MOE_LAYER_END_IDX: 21
729
+ # MOE_LAYER_START_IDX: 18
730
+ # MOE_LAYER_END_IDX: 12
731
+ BATCH_PRIO: True
732
+ USE_TUTEL: True
733
+ FFN_SHARE_GATE_DECISION: True
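
The MOE block above routes each token to TOP_K: 2 of NUM_EXPERTS: 8 experts, with per-expert buffers sized by CAPACITY_FACTOR: 3.0 and floored at MIN_CAPACITY: 4. A toy top-2 gate illustrating those three knobs (a sketch only: the attribute/tag MoE configured here adds noisy 'vmoe' gating, batch prioritization and Tutel kernels, none of which this plain loop models):

import math
import torch
import torch.nn.functional as F

def top2_gate(logits, capacity_factor=3.0, min_capacity=4):
    # logits: [tokens, num_experts]; returns per-token (expert, weight) routes.
    tokens, num_experts = logits.shape
    capacity = max(min_capacity, math.ceil(capacity_factor * tokens / num_experts))
    probs = F.softmax(logits, dim=-1)
    weights, experts = probs.topk(2, dim=-1)           # TOP_K: 2
    weights = weights / weights.sum(-1, keepdim=True)  # renormalize the two gates
    load, routes = [0] * num_experts, []
    for t in range(tokens):
        kept = []
        for k in range(2):
            e = int(experts[t, k])
            if load[e] < capacity:                     # drop tokens past capacity
                load[e] += 1
                kept.append((e, float(weights[t, k])))
        routes.append(kept)
    return routes, load

routes, load = top2_gate(torch.randn(16, 8))           # 16 tokens, 8 experts
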
configs/BERT_L12_H768_experiments/16tasks_training_basemoe_stage2_56gpu.yaml ADDED
@@ -0,0 +1,744 @@
1
+ _BASE_: "base_model_bert_l12_h768.yaml"
2
+
3
+ SHARED_TARGETS:
4
+
5
+ -
6
+ NAME: 'ImageNet22k'
7
+ SHARED_TARGETS_CFG:
8
+ FILE_PATH: 'open_source_dataset/imagenet_22k_class_name_CLIP_with_endoftext.pkl'
9
+ DISTRIBUTED: True
10
+
11
+ -
12
+ NAME: 'Vocab_Word'
13
+ SHARED_TARGETS_CFG:
14
+ FILE_PATH: 'open_source_dataset/vocabulary_CLIP_with_endoftext.pkl'
15
+ DISTRIBUTED: True
16
+
17
+ -
18
+ NAME: 'MomentsInTime'
19
+ SHARED_TARGETS_CFG:
20
+ FILE_PATH: 'open_source_dataset/MiT_class_name_CLIP_with_endoftext.pkl'
21
+ DISTRIBUTED: False
22
+
23
+ -
24
+ NAME: 'Kinetics700'
25
+ SHARED_TARGETS_CFG:
26
+ FILE_PATH: 'open_source_dataset/k700_class_name_CLIP_with_endoftext.pkl'
27
+ DISTRIBUTED: False
28
+
29
+ TASKS:
30
+
31
+ -
32
+ NAME: imagenet22k
33
+ DATASETS:
34
+ TRAIN: 'ImageNet22KDataset'
35
+ TASK_TYPE: 'image_classification'
36
+ DATASET_NAME: 'ImageNet22k'
37
+ TARGET_SET: ['ImageNet22k']
38
+
39
+ DATALOADER:
40
+ TRAIN_BATCH_SIZE: 440
41
+ # TEST_BATCH_SIZE: 2
42
+ NUM_WORKERS: 2
43
+ FEATS_FOLDER: 'open_source_dataset/imagenet22k'
44
+ S3_PATH: 'cluster2:s3://imagenet22k'
45
+ ANNO_FOLDER: 'open_source_dataset/'
46
+ SAMPLING_WEIGHT: 2.486
47
+ MIXUP: 0.0
48
+ CUTMIX: 0.0
49
+ MIXUP_PROB: 1.0
50
+ MIXUP_SWITCH_PROB: 0.5
51
+ MIXUP_MODE: 'batch'
52
+ MIXUP_LABEL_SMOOTHING: 0.1
53
+ MODEL:
54
+ MAX_SEQ_LEN: -1
55
+ LABELS_NUM: 21842
56
+ TEMP_NAME: logit_scale_img_cls
57
+ LOSSES:
58
+ NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
59
+ LOSS_WEIGHT: 1.0
60
+ REDUCTION: 'mean'
61
+ LABELSMOOTHING: 0.1
62
+ INFERENCE:
63
+ NAME: 'ImageNetEvaler'
64
+ ID_KEY: 'image_id'
65
+ VALUE: 'cls_logits'
66
+ VAL_ANNFILE: 'open_source_dataset/imagenet/meta/val.txt'
67
+ # VAL_ANNFILE: '/mnt/lustrenew/lihao2/projects/xmodaler_2/val_debug.txt'
68
+ TEST_ANNFILE: ''
69
+ GENERATION_MODE: False
70
+
71
+ -
72
+ NAME: K700_retrieve
73
+ DATASETS:
74
+ TRAIN: 'VideoDataSet'
75
+ TASK_TYPE: 'video_classification'
76
+ DATASET_NAME: 'K700'
77
+ TARGET_SET: ['Kinetics700']
78
+ DATALOADER:
79
+ TRAIN_BATCH_SIZE: 12
80
+ TEST_BATCH_SIZE: 24
81
+ NUM_WORKERS: 2
82
+ FEATS_FOLDER: 'open_source_dataset/K700'
83
+ ANNO_FOLDER: 'open_source_dataset/K700'
84
+ S3_PATH: 's3://K700/'
85
+ FRAMES_PER_CLIP: 8
86
+ STRIDE: 32
87
+ FILE_EXTENSION: ''
88
+ ANNO_FILE: 'annotation.json'
89
+ TIMESFORMER_AUG: True
90
+ SAMPLING_WEIGHT: 1.0
91
+
92
+ MODEL:
93
+ MAX_SEQ_LEN: -1
94
+ TEMP_NAME: logit_scale_video_cls
95
+ LOSSES:
96
+ NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
97
+ LABELSMOOTHING: 0.1
98
+ LOSS_WEIGHT: 0.1
99
+ INFERENCE:
100
+ VOCAB: 'CLIP'
101
+ GENERATION_MODE: False
102
+
103
+ -
104
+ NAME: MomentsInTime
105
+ DATASETS:
106
+ TRAIN: 'VideoDataSet'
107
+ TASK_TYPE: 'video_classification'
108
+ DATASET_NAME: 'MiT'
109
+ TARGET_SET: ['MomentsInTime']
110
+ DATALOADER:
111
+ TRAIN_BATCH_SIZE: 68
112
+ TEST_BATCH_SIZE: 8
113
+ NUM_WORKERS: 2
114
+ FEATS_FOLDER: 'open_source_dataset/MomentsInTime'
115
+ ANNO_FOLDER: 'open_source_dataset/MomentsInTime'
116
+ S3_PATH: 's3://MomentsInTime/'
117
+ FRAMES_PER_CLIP: 3
118
+ STRIDE: 32
119
+ FILE_EXTENSION: ''
120
+ ANNO_FILE: 'annotation.json'
121
+ TIMESFORMER_AUG: True
122
+ SAMPLING_WEIGHT: 0.2
123
+
124
+ MODEL:
125
+ MAX_SEQ_LEN: -1
126
+ TEMP_NAME: logit_scale_video_cls
127
+ LOSSES:
128
+ NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
129
+ LABELSMOOTHING: 0.1
130
+ LOSS_WEIGHT: 0.1
131
+ INFERENCE:
132
+ NAME: 'MiTEvaler'
133
+ ID_KEY: 'video_name'
134
+ VALUE: 'label'
135
+ VAL_ANNFILE: 'open_source_dataset/MomentsInTime/annotation.json'
136
+ TEST_ANNFILE: ''
137
+ GENERATION_MODE: False
138
+ NUM_VIEWS: 1
139
+
140
+ -
141
+ NAME: bookswiki_pretrain
142
+ DATASETS:
143
+ TRAIN: 'GeneralCorpusDataset'
144
+ TASK_TYPE: 'text_mlm'
145
+ DATASET_NAME: 'BooksWiki'
146
+ TARGET_SET: ['Vocab_Word']
147
+ VERSION: 'v2'
148
+ DATALOADER:
149
+ TRAIN_BATCH_SIZE: 512
150
+ TEST_BATCH_SIZE: 32
151
+ NUM_WORKERS: 2
152
+ ANNO_FOLDER: 'open_source_dataset/text_corpus' # 'open_source_dataset/bert_pretrain_data/bookswiki'
153
+ # ANNO_FOLDER: 'open_source_dataset/bert_pretrain_data/bookswiki'
154
+ # SEQ_PER_SAMPLE: 1 # duplicate key; the SEQ_PER_SAMPLE: 128 below takes effect
155
+ SAMPLER: NodeDistributed
156
+ CACHE_MODE: True
157
+ SEQ_PER_SAMPLE: 128
158
+ MIN_SEQ_PER_SAMPLE: 128
159
+ APPEND_EOS: True
160
+ ONE_STREAM: False
161
+ SAMPLING_WEIGHT: 2.75
162
+ RANDOM_MASK: True
163
+ MODEL:
164
+ MAX_SEQ_LEN: 128
165
+ TEMP_NAME: logit_scale_text_mlm
166
+ LOSSES:
167
+ NAMES: ['CrossEntropy', 'Accuracy']
168
+ LOSS_WEIGHT: 0.5
169
+ REDUCTION: 'mean'
170
+ INFERENCE:
171
+ VOCAB: 'CLIP'
172
+ GENERATION_MODE: False
173
+
174
+
175
+ -
176
+ NAME: yfcc_caption
177
+ DATASETS:
178
+ TRAIN: 'ImageTextPairDataset'
179
+ TASK_TYPE: 'image_caption'
180
+ DATASET_NAME: 'YFCC'
181
+ TARGET_SET: ['Vocab_Word']
182
+ DATALOADER:
183
+ TRAIN_BATCH_SIZE: 200
184
+ TEST_BATCH_SIZE: 32
185
+ NUM_WORKERS: 2
186
+ S3_ANNO_FOLDER: 'cluster2:s3://yfcc'
187
+ ANNO_FOLDER: 'open_source_dataset/yfcc'
188
+ ANNO_FILENAME: 'yfcc100m_subset_available_untokenized.json'
189
+ FEATS_FOLDER: 'open_source_dataset/yfcc/'
190
+ S3_PATH: 'cluster2:s3://yfcc/'
191
+ SEQ_PER_SAMPLE: 1
192
+ SAMPLER: NodeDistributed
193
+ CACHE_MODE: True
194
+ CIRCULAR_CACHE_MODE: False
195
+ ZIP_MODE: False
196
+ CACHE_ORIGIN_IMAGE: False
197
+ RANDOM_CAPTION: False
198
+ AS_NUMPY_AS_POSSIBLE: False
199
+ SAMPLING_WEIGHT: 0.5840
200
+ TRANSFORM: 'clip_transforms'
201
+ MODEL:
202
+ MAX_SEQ_LEN: 50
203
+ TEMP_NAME: logit_scale_caption
204
+ LOSSES:
205
+ NAMES: ['CrossEntropy', 'Accuracy']
206
+ LOSS_WEIGHT: 1.0
207
+ REDUCTION: 'mean'
208
+ INFERENCE:
209
+ VOCAB: 'CLIP'
210
+ GENERATION_MODE: False
211
+
212
+ -
213
+ NAME: cc12m_caption
214
+ DATASETS:
215
+ TRAIN: 'ImageTextPairDataset'
216
+ TASK_TYPE: 'image_caption'
217
+ DATASET_NAME: 'CC12M'
218
+ TARGET_SET: ['Vocab_Word']
219
+ DATALOADER:
220
+ TRAIN_BATCH_SIZE: 200
221
+ TEST_BATCH_SIZE: 32
222
+ NUM_WORKERS: 2
223
+ S3_ANNO_FOLDER: 's3://cc12m/'
224
+ ANNO_FOLDER: 'open_source_dataset/c12m/'
225
+ ANNO_FILENAME: 'train_available.json'
226
+ FEATS_FOLDER: 'open_source_dataset/c12m/'
227
+ S3_PATH: 's3://cc12m/'
228
+ SEQ_PER_SAMPLE: 1
229
+ SAMPLER: NodeDistributed
230
+ CACHE_MODE: True
231
+ CIRCULAR_CACHE_MODE: False
232
+ ZIP_MODE: False
233
+ CACHE_ORIGIN_IMAGE: False
234
+ RANDOM_CAPTION: False
235
+ AS_NUMPY_AS_POSSIBLE: False
236
+ SAMPLING_WEIGHT: 0.5057
237
+ TRANSFORM: 'clip_transforms'
238
+ MODEL:
239
+ MAX_SEQ_LEN: 50
240
+ TEMP_NAME: logit_scale_caption
241
+ LOSSES:
242
+ NAMES: ['CrossEntropy', 'Accuracy']
243
+ LOSS_WEIGHT: 1.0
244
+ REDUCTION: 'mean'
245
+ INFERENCE:
246
+ VOCAB: 'CLIP'
247
+ GENERATION_MODE: False
248
+
249
+ -
250
+ NAME: cc3m_caption
251
+ DATASETS:
252
+ TRAIN: 'ImageTextPairDataset'
253
+ TASK_TYPE: 'image_caption'
254
+ DATASET_NAME: 'CC3M'
255
+ TARGET_SET: ['Vocab_Word']
256
+ DATALOADER:
257
+ TRAIN_BATCH_SIZE: 200
258
+ TEST_BATCH_SIZE: 32
259
+ NUM_WORKERS: 2
260
+ S3_ANNO_FOLDER: 's3://cc3m/'
261
+ ANNO_FOLDER: 'open_source_dataset/cc3m/'
262
+ ANNO_FILENAME: 'train_spacy.json'
263
+ FEATS_FOLDER: 'open_source_dataset/cc3m/'
264
+ S3_PATH: 's3://cc3m/'
265
+ SEQ_PER_SAMPLE: 1
266
+ SAMPLER: NodeDistributed
267
+ CACHE_MODE: True
268
+ CIRCULAR_CACHE_MODE: False
269
+ ZIP_MODE: False
270
+ CACHE_ORIGIN_IMAGE: False
271
+ RANDOM_CAPTION: False
272
+ AS_NUMPY_AS_POSSIBLE: False
273
+ SAMPLING_WEIGHT: 0.26295
274
+ TRANSFORM: 'clip_transforms'
275
+ MODEL:
276
+ MAX_SEQ_LEN: 50
277
+ TEMP_NAME: logit_scale_caption
278
+ LOSSES:
279
+ NAMES: ['CrossEntropy', 'Accuracy']
280
+ LOSS_WEIGHT: 1.0
281
+ REDUCTION: 'mean'
282
+ INFERENCE:
283
+ VOCAB: 'CLIP'
284
+ GENERATION_MODE: False
285
+
286
+ -
287
+ NAME: vg_caption
288
+ DATASETS:
289
+ TRAIN: 'ImageTextPairDataset'
290
+ TASK_TYPE: 'image_caption'
291
+ DATASET_NAME: 'VG'
292
+ TARGET_SET: ['Vocab_Word']
293
+ DATALOADER:
294
+ TRAIN_BATCH_SIZE: 200
295
+ TEST_BATCH_SIZE: 32
296
+ NUM_WORKERS: 2
297
+ FEATS_FOLDER: 'open_source_dataset/visual_genome/images'
298
+ ANNO_FOLDER: 'open_source_dataset/visual_genome/annotations'
299
+ S3_PATH: 's3://visual_genome/images'
300
+ ANNO_FILENAME: 'vg_captions_128filter.json'
301
+ SEQ_PER_SAMPLE: 1
302
+ CACHE_MODE: True
303
+ CIRCULAR_CACHE_MODE: False
304
+ ZIP_MODE: False
305
+ CACHE_ORIGIN_IMAGE: False
306
+ RANDOM_CAPTION: False
307
+ AS_NUMPY_AS_POSSIBLE: False
308
+ SAMPLING_WEIGHT: 0.1766
309
+ TRANSFORM: 'clip_transforms'
310
+ MODEL:
311
+ MAX_SEQ_LEN: 30
312
+ TEMP_NAME: logit_scale_caption
313
+ LOSSES:
314
+ NAMES: ['CrossEntropy', 'Accuracy']
315
+ LOSS_WEIGHT: 1.0
316
+ REDUCTION: 'mean'
317
+ INFERENCE:
318
+ VOCAB: 'CLIP'
319
+ GENERATION_MODE: True
320
+
321
+
322
+ -
323
+ NAME: mscoco_caption
324
+ DATASETS:
325
+ TRAIN: 'ImageTextPairDataset'
326
+ # VAL: 'ImageTextPairDataset'
327
+ # TEST: 'ImageTextPairDataset'
328
+ TASK_TYPE: 'image_caption'
329
+ DATASET_NAME: 'MSCOCO'
330
+ TARGET_SET: ['Vocab_Word']
331
+ DATALOADER:
332
+ TRAIN_BATCH_SIZE: 200
333
+ TEST_BATCH_SIZE: 32
334
+ NUM_WORKERS: 1
335
+ FEATS_FOLDER: 'open_source_dataset/mscoco_dataset/coco_origin'
336
+ ANNO_FOLDER: 'open_source_dataset/mscoco_dataset/new_annotations'
337
+ S3_PATH: 's3://coco/'
338
+ SEQ_PER_SAMPLE: 1
339
+ CACHE_MODE: True
340
+ CIRCULAR_CACHE_MODE: False
341
+ ZIP_MODE: False
342
+ CACHE_ORIGIN_IMAGE: False
343
+ RANDOM_CAPTION: False
344
+ AS_NUMPY_AS_POSSIBLE: False
345
+ SAMPLING_WEIGHT: 0.1144
346
+ TRANSFORM: 'clip_transforms'
347
+ RANDOM_MASK: True
348
+ MODEL:
349
+ MAX_SEQ_LEN: 50
350
+ EVAL_MAX_SEQ_LEN: 21
351
+ TEMP_NAME: logit_scale_caption
352
+ LOSSES:
353
+ NAMES: ['CrossEntropy', 'Accuracy']
354
+ LOSS_WEIGHT: 1.0
355
+ REDUCTION: 'mean'
356
+ DECODE_STRATEGY:
357
+ NAME: 'CaptionBeamSearcherV3'
358
+ BEAM_SIZE: 2
359
+ # LEN_PENALTY: 1.0
360
+ INFERENCE:
361
+ NAME: 'COCOEvaler'
362
+ VOCAB: 'CLIP'
363
+ ID_KEY: 'image_id'
364
+ VALUE: 'caption'
365
+ VAL_ANNFILE: 'open_source_dataset/mscoco_dataset/new_annotations/captions_val5k.json'
366
+ TEST_ANNFILE: 'open_source_dataset/mscoco_dataset/new_annotations/captions_test5k.json'
367
+ GENERATION_MODE: True
368
+
369
+ -
370
+ NAME: sbu_caption
371
+ DATASETS:
372
+ TRAIN: 'ImageTextPairDataset'
373
+ TASK_TYPE: 'image_caption'
374
+ DATASET_NAME: 'SBU'
375
+ TARGET_SET: ['Vocab_Word']
376
+ DATALOADER:
377
+ TRAIN_BATCH_SIZE: 200
378
+ TEST_BATCH_SIZE: 32
379
+ NUM_WORKERS: 1
380
+ S3_ANNO_FOLDER: 's3://SBU/annotations'
381
+ ANNO_FOLDER: 'open_source_dataset/sbucaption/annotations'
382
+ ANNO_FILENAME: 'subcaption.json'
383
+ FEATS_FOLDER: 'open_source_dataset/sbucaption/'
384
+ S3_PATH: 's3://SBU/images'
385
+ SEQ_PER_SAMPLE: 1
386
+ SAMPLER: NodeDistributed
387
+ CACHE_MODE: True
388
+ CIRCULAR_CACHE_MODE: False
389
+ ZIP_MODE: False
390
+ CACHE_ORIGIN_IMAGE: False
391
+ RANDOM_CAPTION: False
392
+ AS_NUMPY_AS_POSSIBLE: False
393
+ SAMPLING_WEIGHT: 0.1383
394
+ TRANSFORM: 'clip_transforms'
395
+ MODEL:
396
+ MAX_SEQ_LEN: 50
397
+ TEMP_NAME: logit_scale_caption
398
+ LOSSES:
399
+ NAMES: ['CrossEntropy', 'Accuracy']
400
+ LOSS_WEIGHT: 1.0
401
+ REDUCTION: 'mean'
402
+ INFERENCE:
403
+ VOCAB: 'CLIP'
404
+ GENERATION_MODE: False
405
+
406
+ -
407
+ NAME: yfcc_retrieve
408
+ DATASETS:
409
+ TRAIN: 'ImageTextPairDataset'
410
+ TASK_TYPE: 'image_retrieval'
411
+ DATASET_NAME: 'YFCC'
412
+ DATALOADER:
413
+ TRAIN_BATCH_SIZE: 320
414
+ TEST_BATCH_SIZE: 32
415
+ NUM_WORKERS: 2
416
+ S3_ANNO_FOLDER: 'cluster2:s3://yfcc'
417
+ ANNO_FOLDER: 'open_source_dataset/yfcc'
418
+ ANNO_FILENAME: 'yfcc100m_subset_available_untokenized.json'
419
+ FEATS_FOLDER: 'open_source_dataset/yfcc/'
420
+ S3_PATH: 'cluster2:s3://yfcc/'
421
+ SAMPLER: NodeDistributed
422
+ CACHE_MODE: True
423
+ CIRCULAR_CACHE_MODE: False
424
+ ZIP_MODE: False
425
+ CACHE_ORIGIN_IMAGE: False
426
+ RANDOM_CAPTION: False
427
+ AS_NUMPY_AS_POSSIBLE: False
428
+ SAMPLING_WEIGHT: 0.5840
429
+ TRANSFORM: 'clip_transforms'
430
+ MODEL:
431
+ MAX_SEQ_LEN: 50
432
+ TEMP_NAME: logit_scale_retrieve
433
+ LOSSES:
434
+ NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
435
+ LABELSMOOTHING: 0.1
436
+ LOSS_WEIGHT: 0.5
437
+ REDUCTION: 'mean'
438
+ INFERENCE:
439
+ VOCAB: 'CLIP'
440
+ GENERATION_MODE: False
441
+
442
+ -
443
+ NAME: cc12m_retrieve
444
+ DATASETS:
445
+ TRAIN: 'ImageTextPairDataset'
446
+ TASK_TYPE: 'image_retrieval'
447
+ DATASET_NAME: 'CC12M'
448
+ DATALOADER:
449
+ TRAIN_BATCH_SIZE: 320
450
+ TEST_BATCH_SIZE: 32
451
+ NUM_WORKERS: 2
452
+ S3_ANNO_FOLDER: 's3://cc12m/'
453
+ ANNO_FOLDER: 'open_source_dataset/c12m/'
454
+ ANNO_FILENAME: 'train_available.json'
455
+ FEATS_FOLDER: 'open_source_dataset/c12m/'
456
+ S3_PATH: 's3://cc12m/'
457
+ SAMPLER: NodeDistributed
458
+ CACHE_MODE: True
459
+ CIRCULAR_CACHE_MODE: False
460
+ ZIP_MODE: False
461
+ CACHE_ORIGIN_IMAGE: False
462
+ RANDOM_CAPTION: False
463
+ AS_NUMPY_AS_POSSIBLE: False
464
+ SAMPLING_WEIGHT: 0.5057
465
+ TRANSFORM: 'clip_transforms'
466
+ MODEL:
467
+ MAX_SEQ_LEN: 50
468
+ TEMP_NAME: logit_scale_retrieve
469
+ LOSSES:
470
+ NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
471
+ LABELSMOOTHING: 0.1
472
+ LOSS_WEIGHT: 0.5
473
+ REDUCTION: 'mean'
474
+ INFERENCE:
475
+ VOCAB: 'CLIP'
476
+ GENERATION_MODE: False
477
+
478
+ -
479
+ NAME: cc3m_retrieve
480
+ DATASETS:
481
+ TRAIN: 'ImageTextPairDataset'
482
+ TASK_TYPE: 'image_retrieval'
483
+ DATASET_NAME: 'CC3M'
484
+ DATALOADER:
485
+ TRAIN_BATCH_SIZE: 320
486
+ TEST_BATCH_SIZE: 32
487
+ NUM_WORKERS: 2
488
+ S3_ANNO_FOLDER: 's3://cc3m/'
489
+ ANNO_FOLDER: 'open_source_dataset/cc3m/'
490
+ ANNO_FILENAME: 'train_spacy.json'
491
+ FEATS_FOLDER: 'open_source_dataset/cc3m/'
492
+ S3_PATH: 's3://cc3m/'
493
+ SAMPLER: NodeDistributed
494
+ CACHE_MODE: True
495
+ CIRCULAR_CACHE_MODE: False
496
+ ZIP_MODE: False
497
+ CACHE_ORIGIN_IMAGE: False
498
+ RANDOM_CAPTION: False
499
+ AS_NUMPY_AS_POSSIBLE: False
500
+ SAMPLING_WEIGHT: 0.26295
501
+ TRANSFORM: 'clip_transforms'
502
+ MODEL:
503
+ MAX_SEQ_LEN: 50
504
+ TEMP_NAME: logit_scale_retrieve
505
+ LOSSES:
506
+ NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
507
+ LABELSMOOTHING: 0.1
508
+ LOSS_WEIGHT: 0.5
509
+ REDUCTION: 'mean'
510
+ INFERENCE:
511
+ VOCAB: 'CLIP'
512
+ GENERATION_MODE: False
513
+
514
+ -
515
+ NAME: vg_retrieve
516
+ DATASETS:
517
+ TRAIN: 'ImageTextPairDataset'
518
+ TASK_TYPE: 'image_retrieval'
519
+ DATASET_NAME: 'VG'
520
+ DATALOADER:
521
+ TRAIN_BATCH_SIZE: 320
522
+ TEST_BATCH_SIZE: 32
523
+ NUM_WORKERS: 2
524
+ FEATS_FOLDER: 'open_source_dataset/visual_genome/images'
525
+ ANNO_FOLDER: 'open_source_dataset/visual_genome/annotations'
526
+ S3_PATH: 's3://visual_genome/images'
527
+ ANNO_FILENAME: 'vg_captions_128filter.json'
528
+ SEQ_PER_SAMPLE: 1
529
+ CACHE_MODE: True
530
+ CIRCULAR_CACHE_MODE: False
531
+ ZIP_MODE: False
532
+ CACHE_ORIGIN_IMAGE: False
533
+ RANDOM_CAPTION: False
534
+ AS_NUMPY_AS_POSSIBLE: False
535
+ SAMPLING_WEIGHT: 0.1766
536
+ TRANSFORM: 'clip_transforms'
537
+ MODEL:
538
+ MAX_SEQ_LEN: 30
539
+ TEMP_NAME: logit_scale_retrieve
540
+ LOSSES:
541
+ NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
542
+ LABELSMOOTHING: 0.1
543
+ LOSS_WEIGHT: 0.5
544
+ REDUCTION: 'mean'
545
+ INFERENCE:
546
+ VOCAB: 'CLIP'
547
+ GENERATION_MODE: False
548
+
549
+ -
550
+ NAME: mscoco_retrieve
551
+ DATASETS:
552
+ TRAIN: 'ImageTextPairDataset'
553
+ # TEST: 'ImageTextPairDataset'
554
+ TASK_TYPE: 'image_retrieval'
555
+ DATASET_NAME: 'MSCOCO'
556
+ DATALOADER:
557
+ TRAIN_BATCH_SIZE: 320
558
+ TEST_BATCH_SIZE: 32
559
+ NUM_WORKERS: 1
560
+ FEATS_FOLDER: 'open_source_dataset/mscoco_dataset/coco_origin'
561
+ ANNO_FOLDER: 'open_source_dataset/mscoco_dataset/new_annotations'
562
+ S3_PATH: 's3://coco/'
563
+ SEQ_PER_SAMPLE: 1
564
+ CACHE_MODE: True
565
+ CIRCULAR_CACHE_MODE: False
566
+ ZIP_MODE: False
567
+ CACHE_ORIGIN_IMAGE: False
568
+ RANDOM_CAPTION: False
569
+ AS_NUMPY_AS_POSSIBLE: False
570
+ SAMPLING_WEIGHT: 0.1144
571
+ TRANSFORM: 'clip_transforms'
572
+ MODEL:
573
+ MAX_SEQ_LEN: 50
574
+ TEMP_NAME: logit_scale_retrieve
575
+ LOSSES:
576
+ NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
577
+ LABELSMOOTHING: 0.1
578
+ LOSS_WEIGHT: 0.5
579
+ REDUCTION: 'mean'
580
+ INFERENCE:
581
+ VOCAB: 'CLIP'
582
+ ID_KEY: 'image_id'
583
+ VALUE: 'caption'
584
+ NAME: 'RetrievalEvaler'
585
+ VAL_ANNFILE: 'open_source_dataset/flickr30k/all_data_final_val_set0_2014.jsonline'
586
+ TEST_ANNFILE: 'open_source_dataset/flickr30k/all_data_final_test_set0_2014.jsonline'
587
+ GENERATION_MODE: False
588
+
589
+ -
590
+ NAME: sbu_retrieve
591
+ DATASETS:
592
+ TRAIN: 'ImageTextPairDataset'
593
+ TASK_TYPE: 'image_retrieval'
594
+ DATASET_NAME: 'SBU'
595
+ DATALOADER:
596
+ TRAIN_BATCH_SIZE: 320
597
+ TEST_BATCH_SIZE: 32
598
+ NUM_WORKERS: 1
599
+ S3_ANNO_FOLDER: 's3://SBU/annotations'
600
+ ANNO_FOLDER: 'open_source_dataset/sbucaption/annotations'
601
+ ANNO_FILENAME: 'subcaption.json'
602
+ FEATS_FOLDER: 'open_source_dataset/sbucaption/'
603
+ S3_PATH: 's3://SBU/images'
604
+ SAMPLER: NodeDistributed
605
+ CACHE_MODE: True
606
+ CIRCULAR_CACHE_MODE: False
607
+ ZIP_MODE: False
608
+ CACHE_ORIGIN_IMAGE: False
609
+ RANDOM_CAPTION: False
610
+ AS_NUMPY_AS_POSSIBLE: False
611
+ SAMPLING_WEIGHT: 0.1383
612
+ TRANSFORM: 'clip_transforms'
613
+ MODEL:
614
+ MAX_SEQ_LEN: 50
615
+ TEMP_NAME: logit_scale_retrieve
616
+ LOSSES:
617
+ NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
618
+ LABELSMOOTHING: 0.1
619
+ LOSS_WEIGHT: 0.5
620
+ REDUCTION: 'mean'
621
+ INFERENCE:
622
+ VOCAB: 'CLIP'
623
+ GENERATION_MODE: False
624
+
625
+
626
+ ENGINE:
627
+ NAME: 'UnifiedTrainer'
628
+
629
+ MODEL:
630
+ META_ARCHITECTURE: 'MultiTaskTransformerEncoder'
631
+ ENCODER: 'UnifiedBertEncoder'
632
+
633
+
634
+ SHARE_LAYERNORM: True
635
+ BERT:
636
+ NORMALIZE_DECISION: "BERTPre"
637
+ DROP_PATH_PROB: 0.1
638
+ DROP_PATH_PROB_FIXED: True
639
+
640
+ UNIFY_QKV: True
641
+
642
+ MODEL_EMA: False
643
+ MODEL_EMA_DECAY: 0.9999
644
+
645
+ MAEParamsInit: True
646
+ POSEMBEDFIX: True
647
+
648
+
649
+ IMG_INPUT_SIZE: 224
650
+ PATCH_SIZE: 16
651
+ POSEMBED_SCALE: !!python/object/apply:eval ["160/224"]
652
+ CHECKPOINT_FILETER: False
653
+
654
+ LAYER_SCALE: True
655
+ LAYER_SCALE_INIT: 1e-3
656
+ OLD_CHECKPONT: True
657
+
658
+ LAYER_SCALE_FP32: True
659
+ GATE_FP32: False
660
+ TAG_TRANSFORM_FP32: False
661
+
662
+
663
+ DATALOADER:
664
+ USE_WEIGHTED_SAMPLER: True
665
+ UNIFIED_DATASET: True
666
+ NUM_WORKERS: 32
667
+
668
+ PADDING_TO_MAX: False # set True for debugging, or for token-level MoE with distributed experts
669
+
670
+
671
+
672
+ ####################################### Optimizer #######################################
673
+ SOLVER:
674
+ NAME: 'Adam'
675
+ TORCH_OPTIMIZER: True
676
+ PARAMS_SEPERATE: True
677
+ # PARAMS_GROUP: True
678
+ # EPOCH: 1
679
+ MAX_ITER: 50000
680
+ CHECKPOINT_PERIOD: 5000
681
+ EVAL_PERIOD: 10000000
682
+ BASE_LR: 0.00002
683
+ BIAS_LR_FACTOR: 1.0
684
+ WEIGHT_DECAY: 0.05
685
+ WEIGHT_DECAY_NORM: 0.0
686
+ WEIGHT_DECAY_BIAS: 0.0
687
+ WEIGHT_DECAY_EMBEDDING: 0.0
688
+ MOMENTUM: 0.9
689
+ DAMPENING: 0.0
690
+ NESTEROV: 0.0
691
+ BETAS: [0.9, 0.95]
692
+ EPS: 1e-6
693
+ GRAD_CLIP: 0.1
694
+ GRAD_CLIP_TYPE: 'norm'
695
+ ACCUM_ITER: 0
696
+ AMP_FP16: True
697
+ APEX_FP16: False # dangerous: keep disabled and use AMP_FP16 above instead
698
+
699
+ WRITE_PERIOD: 50
700
+ MIN_LOSS_SCLE: 2048.0 # floor for the dynamic fp16 loss scale
701
+ # BF16: False # True
702
+ # ZEROSTAGE: 2
703
+
704
+ LOSS_SCALE_WINDOW: 200
705
+
706
+
707
+ ####################################### lr scheduler #######################################
708
+ LR_SCHEDULER:
709
+ NAME: 'WarmupCosine'
710
+ WARMUP: 5000
711
+ MIN_LR: 0.000001
712
+
713
+ ####################################### evaluation #######################################
714
+ INFERENCE:
715
+
716
+ VOCAB: 'CLIP'
717
+ ITER_BASED: True
718
+
719
+
720
+ find_unused_parameters: true
721
+
722
+ MOE:
723
+ MOE: True
724
+ MOE_TYPE: 'attribute'
725
+ TAG_Transform: True
726
+ ATTRIBUTE_LENGTH: 8
727
+ EP_WORLD_SIZE: 1 # expert-parallel world size; relevant to tag MoE only
728
+ NUM_EXPERTS: 8
729
+ TOP_K: 2
730
+ CAPACITY_FACTOR: 3.0
731
+ EVAL_MIN_CAPACITY: 4.0
732
+ MIN_CAPACITY: 4
733
+ NOISY_GATE_POLICY: 'vmoe'
734
+ MOE_PARAM_GROUP: True
735
+ MOE_EXPERT_TYPE: 'FFN,SA'
736
+ SA_LINEAR_OUT_MOE: True
737
+ MOE_EXPERT_LOCATION: 'odd' # place expert layers on odd-numbered blocks
738
+ # MOE_LAYER_START_IDX: 3
739
+ # MOE_LAYER_END_IDX: 21
740
+ # MOE_LAYER_START_IDX: 18
741
+ # MOE_LAYER_END_IDX: 12
742
+ BATCH_PRIO: True
743
+ USE_TUTEL: True
744
+ FFN_SHARE_GATE_DECISION: True
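
Note that the stage-2 configs raise IMG_INPUT_SIZE from 160 (stage 1) to 224, and POSEMBED_SCALE: !!python/object/apply:eval ["160/224"] presumably rescales the pretrained position embeddings to match. The tag itself is standard PyYAML: a loader that permits Python tags evaluates the string and stores the resulting float, while safe_load rejects the tag outright. A minimal demonstration, assuming the trainer loads its YAML with such a loader:

import yaml

doc = 'POSEMBED_SCALE: !!python/object/apply:eval ["160/224"]'
cfg = yaml.unsafe_load(doc)   # yaml.safe_load would raise ConstructorError here
print(cfg['POSEMBED_SCALE'])  # 0.7142857142857143, i.e. 160/224
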
configs/BERT_L12_H768_experiments/16tasks_training_stage2_64gpu_v1.yaml ADDED
@@ -0,0 +1,750 @@
1
+ _BASE_: "base_model_bert_l12_h768.yaml"
2
+
3
+ SHARED_TARGETS:
4
+
5
+ -
6
+ NAME: 'ImageNet22k'
7
+ SHARED_TARGETS_CFG:
8
+ FILE_PATH: 'open_source_dataset/imagenet_22k_class_name_CLIP_with_endoftext.pkl'
9
+ DISTRIBUTED: True
10
+
11
+ -
12
+ NAME: 'Vocab_Word'
13
+ SHARED_TARGETS_CFG:
14
+ FILE_PATH: 'open_source_dataset/vocabulary_CLIP_with_endoftext.pkl'
15
+ DISTRIBUTED: True
16
+
17
+ -
18
+ NAME: 'MomentsInTime'
19
+ SHARED_TARGETS_CFG:
20
+ FILE_PATH: 'open_source_dataset/MiT_class_name_CLIP_with_endoftext.pkl'
21
+ DISTRIBUTED: False
22
+
23
+ -
24
+ NAME: 'Kinetics700'
25
+ SHARED_TARGETS_CFG:
26
+ FILE_PATH: 'open_source_dataset/k700_class_name_CLIP_with_endoftext.pkl'
27
+ DISTRIBUTED: False
28
+
29
+ TASKS:
30
+
31
+ -
32
+ NAME: imagenet22k
33
+ DATASETS:
34
+ TRAIN: 'ImageNet22KDataset'
35
+ TASK_TYPE: 'image_classification'
36
+ DATASET_NAME: 'ImageNet22k'
37
+ TARGET_SET: ['ImageNet22k']
38
+
39
+ DATALOADER:
40
+ TRAIN_BATCH_SIZE: 440
41
+ # TEST_BATCH_SIZE: 2
42
+ NUM_WORKERS: 2
43
+ FEATS_FOLDER: 'open_source_dataset/imagenet22k'
44
+ S3_PATH: 'cluster2:s3://imagenet22k'
45
+ ANNO_FOLDER: 'open_source_dataset/'
46
+ SAMPLING_WEIGHT: 2.486
47
+ MIXUP: 0.0
48
+ CUTMIX: 0.0
49
+ MIXUP_PROB: 1.0
50
+ MIXUP_SWITCH_PROB: 0.5
51
+ MIXUP_MODE: 'batch'
52
+ MIXUP_LABEL_SMOOTHING: 0.1
53
+ MODEL:
54
+ MAX_SEQ_LEN: -1
55
+ LABELS_NUM: 21842
56
+ TEMP_NAME: logit_scale_img_cls
57
+ LOSSES:
58
+ NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
59
+ LOSS_WEIGHT: 1.0
60
+ REDUCTION: 'mean'
61
+ LABELSMOOTHING: 0.1
62
+ INFERENCE:
63
+ NAME: 'ImageNetEvaler'
64
+ ID_KEY: 'image_id'
65
+ VALUE: 'cls_logits'
66
+ VAL_ANNFILE: 'open_source_dataset/imagenet/meta/val.txt'
67
+ # VAL_ANNFILE: '/mnt/lustrenew/lihao2/projects/xmodaler_2/val_debug.txt'
68
+ TEST_ANNFILE: ''
69
+ GENERATION_MODE: False
70
+
71
+ -
72
+ NAME: K700_retrieve
73
+ DATASETS:
74
+ TRAIN: 'VideoDataSet'
75
+ TASK_TYPE: 'video_classification'
76
+ DATASET_NAME: 'K700'
77
+ TARGET_SET: ['Kinetics700']
78
+ DATALOADER:
79
+ TRAIN_BATCH_SIZE: 12
80
+ TEST_BATCH_SIZE: 24
81
+ NUM_WORKERS: 2
82
+ FEATS_FOLDER: 'open_source_dataset/K700'
83
+ ANNO_FOLDER: 'open_source_dataset/K700'
84
+ S3_PATH: 's3://K700/'
85
+ FRAMES_PER_CLIP: 8
86
+ STRIDE: 32
87
+ FILE_EXTENSION: ''
88
+ ANNO_FILE: 'annotation.json'
89
+ TIMESFORMER_AUG: True
90
+ SAMPLING_WEIGHT: 1.0
91
+
92
+ MODEL:
93
+ MAX_SEQ_LEN: -1
94
+ TEMP_NAME: logit_scale_video_cls
95
+ LOSSES:
96
+ NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
97
+ LABELSMOOTHING: 0.1
98
+ LOSS_WEIGHT: 0.1
99
+ INFERENCE:
100
+ VOCAB: 'CLIP'
101
+ GENERATION_MODE: False
102
+
103
+ -
104
+ NAME: MomentsInTime
105
+ DATASETS:
106
+ TRAIN: 'VideoDataSet'
107
+ TASK_TYPE: 'video_classification'
108
+ DATASET_NAME: 'MiT'
109
+ TARGET_SET: ['MomentsInTime']
110
+ DATALOADER:
111
+ TRAIN_BATCH_SIZE: 68
112
+ TEST_BATCH_SIZE: 8
113
+ NUM_WORKERS: 2
114
+ FEATS_FOLDER: 'open_source_dataset/MomentsInTime'
115
+ ANNO_FOLDER: 'open_source_dataset/MomentsInTime'
116
+ S3_PATH: 's3://MomentsInTime/'
117
+ FRAMES_PER_CLIP: 3
118
+ STRIDE: 32
119
+ FILE_EXTENSION: ''
120
+ ANNO_FILE: 'annotation.json'
121
+ TIMESFORMER_AUG: True
122
+ SAMPLING_WEIGHT: 0.2
123
+
124
+ MODEL:
125
+ MAX_SEQ_LEN: -1
126
+ TEMP_NAME: logit_scale_video_cls
127
+ LOSSES:
128
+ NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
129
+ LABELSMOOTHING: 0.1
130
+ LOSS_WEIGHT: 0.1
131
+ INFERENCE:
132
+ NAME: 'MiTEvaler'
133
+ ID_KEY: 'video_name'
134
+ VALUE: 'label'
135
+ VAL_ANNFILE: 'open_source_dataset/MomentsInTime/annotation.json'
136
+ TEST_ANNFILE: ''
137
+ GENERATION_MODE: False
138
+ NUM_VIEWS: 1
139
+
140
+ -
141
+ NAME: bookswiki_pretrain
142
+ DATASETS:
143
+ TRAIN: 'GeneralCorpusDataset'
144
+ TASK_TYPE: 'text_mlm'
145
+ DATASET_NAME: 'BooksWiki'
146
+ TARGET_SET: ['Vocab_Word']
147
+ VERSION: 'v2'
148
+ DATALOADER:
149
+ TRAIN_BATCH_SIZE: 512
150
+ TEST_BATCH_SIZE: 32
151
+ NUM_WORKERS: 2
152
+ ANNO_FOLDER: 'open_source_dataset/text_corpus' # 'open_source_dataset/bert_pretrain_data/bookswiki'
153
+ # ANNO_FOLDER: 'open_source_dataset/bert_pretrain_data/bookswiki'
154
+ # SEQ_PER_SAMPLE: 1 # duplicate key; the SEQ_PER_SAMPLE: 128 below takes effect
155
+ SAMPLER: NodeDistributed
156
+ CACHE_MODE: True
157
+ SEQ_PER_SAMPLE: 128
158
+ MIN_SEQ_PER_SAMPLE: 128
159
+ APPEND_EOS: True
160
+ ONE_STREAM: False
161
+ SAMPLING_WEIGHT: 2.75
162
+ RANDOM_MASK: True
163
+ MODEL:
164
+ MAX_SEQ_LEN: 128
165
+ TEMP_NAME: logit_scale_text_mlm
166
+ LOSSES:
167
+ NAMES: ['CrossEntropy', 'Accuracy']
168
+ LOSS_WEIGHT: 0.5
169
+ REDUCTION: 'mean'
170
+ INFERENCE:
171
+ VOCAB: 'CLIP'
172
+ GENERATION_MODE: False
173
+
174
+
175
+ -
176
+ NAME: yfcc_caption
177
+ DATASETS:
178
+ TRAIN: 'ImageTextPairDataset'
179
+ TASK_TYPE: 'image_caption'
180
+ DATASET_NAME: 'YFCC'
181
+ TARGET_SET: ['Vocab_Word']
182
+ DATALOADER:
183
+ TRAIN_BATCH_SIZE: 200
184
+ TEST_BATCH_SIZE: 32
185
+ NUM_WORKERS: 2
186
+ S3_ANNO_FOLDER: 'cluster2:s3://yfcc'
187
+ ANNO_FOLDER: 'open_source_dataset/yfcc'
188
+ ANNO_FILENAME: 'yfcc100m_subset_available_untokenized.json'
189
+ FEATS_FOLDER: 'open_source_dataset/yfcc/'
190
+ S3_PATH: 'cluster2:s3://yfcc/'
191
+ SEQ_PER_SAMPLE: 1
192
+ SAMPLER: NodeDistributed
193
+ CACHE_MODE: True
194
+ CIRCULAR_CACHE_MODE: False
195
+ ZIP_MODE: False
196
+ CACHE_ORIGIN_IMAGE: False
197
+ RANDOM_CAPTION: False
198
+ AS_NUMPY_AS_POSSIBLE: False
199
+ SAMPLING_WEIGHT: 0.5840
200
+ TRANSFORM: 'clip_transforms'
201
+ MODEL:
202
+ MAX_SEQ_LEN: 50
203
+ TEMP_NAME: logit_scale_caption
204
+ LOSSES:
205
+ NAMES: ['CrossEntropy', 'Accuracy']
206
+ LOSS_WEIGHT: 1.0
207
+ REDUCTION: 'mean'
208
+ INFERENCE:
209
+ VOCAB: 'CLIP'
210
+      GENERATION_MODE: False
+
+  -
+    NAME: cc12m_caption
+    DATASETS:
+      TRAIN: 'ImageTextPairDataset'
+      TASK_TYPE: 'image_caption'
+      DATASET_NAME: 'CC12M'
+      TARGET_SET: ['Vocab_Word']
+    DATALOADER:
+      TRAIN_BATCH_SIZE: 200
+      TEST_BATCH_SIZE: 32
+      NUM_WORKERS: 2
+      S3_ANNO_FOLDER: 's3://cc12m/'
+      ANNO_FOLDER: 'open_source_dataset/c12m/'
+      ANNO_FILENAME: 'train_available.json'
+      FEATS_FOLDER: 'open_source_dataset/c12m/'
+      S3_PATH: 's3://cc12m/'
+      SEQ_PER_SAMPLE: 1
+      SAMPLER: NodeDistributed
+      CACHE_MODE: True
+      CIRCULAR_CACHE_MODE: False
+      ZIP_MODE: False
+      CACHE_ORIGIN_IMAGE: False
+      RANDOM_CAPTION: False
+      AS_NUMPY_AS_POSSIBLE: False
+      SAMPLING_WEIGHT: 0.5057
+      TRANSFORM: 'clip_transforms'
+    MODEL:
+      MAX_SEQ_LEN: 50
+      TEMP_NAME: logit_scale_caption
+    LOSSES:
+      NAMES: ['CrossEntropy', 'Accuracy']
+      LOSS_WEIGHT: 1.0
+      REDUCTION: 'mean'
+    INFERENCE:
+      VOCAB: 'CLIP'
+      GENERATION_MODE: False
+
+  -
+    NAME: cc3m_caption
+    DATASETS:
+      TRAIN: 'ImageTextPairDataset'
+      TASK_TYPE: 'image_caption'
+      DATASET_NAME: 'CC3M'
+      TARGET_SET: ['Vocab_Word']
+    DATALOADER:
+      TRAIN_BATCH_SIZE: 200
+      TEST_BATCH_SIZE: 32
+      NUM_WORKERS: 2
+      S3_ANNO_FOLDER: 's3://cc3m/'
+      ANNO_FOLDER: 'open_source_dataset/cc3m/'
+      ANNO_FILENAME: 'train_spacy.json'
+      FEATS_FOLDER: 'open_source_dataset/cc3m/'
+      S3_PATH: 's3://cc3m/'
+      SEQ_PER_SAMPLE: 1
+      SAMPLER: NodeDistributed
+      CACHE_MODE: True
+      CIRCULAR_CACHE_MODE: False
+      ZIP_MODE: False
+      CACHE_ORIGIN_IMAGE: False
+      RANDOM_CAPTION: False
+      AS_NUMPY_AS_POSSIBLE: False
+      SAMPLING_WEIGHT: 0.26295
+      TRANSFORM: 'clip_transforms'
+    MODEL:
+      MAX_SEQ_LEN: 50
+      TEMP_NAME: logit_scale_caption
+    LOSSES:
+      NAMES: ['CrossEntropy', 'Accuracy']
+      LOSS_WEIGHT: 1.0
+      REDUCTION: 'mean'
+    INFERENCE:
+      VOCAB: 'CLIP'
+      GENERATION_MODE: False
+
+  -
+    NAME: vg_caption
+    DATASETS:
+      TRAIN: 'ImageTextPairDataset'
+      TASK_TYPE: 'image_caption'
+      DATASET_NAME: 'VG'
+      TARGET_SET: ['Vocab_Word']
+    DATALOADER:
+      TRAIN_BATCH_SIZE: 200
+      TEST_BATCH_SIZE: 32
+      NUM_WORKERS: 2
+      FEATS_FOLDER: 'open_source_dataset/visual_genome/images'
+      ANNO_FOLDER: 'open_source_dataset/visual_genome/annotations'
+      S3_PATH: 's3://visual_genome/images'
+      ANNO_FILENAME: 'vg_captions_128filter.json'
+      SEQ_PER_SAMPLE: 1
+      CACHE_MODE: True
+      CIRCULAR_CACHE_MODE: False
+      ZIP_MODE: False
+      CACHE_ORIGIN_IMAGE: False
+      RANDOM_CAPTION: False
+      AS_NUMPY_AS_POSSIBLE: False
+      SAMPLING_WEIGHT: 0.1766
+      TRANSFORM: 'clip_transforms'
+    MODEL:
+      MAX_SEQ_LEN: 30
+      TEMP_NAME: logit_scale_caption
+    LOSSES:
+      NAMES: ['CrossEntropy', 'Accuracy']
+      LOSS_WEIGHT: 1.0
+      REDUCTION: 'mean'
+    INFERENCE:
+      VOCAB: 'CLIP'
+      GENERATION_MODE: True
+
+
+  -
+    NAME: mscoco_caption
+    DATASETS:
+      TRAIN: 'ImageTextPairDataset'
+      # VAL: 'ImageTextPairDataset'
+      # TEST: 'ImageTextPairDataset'
+      TASK_TYPE: 'image_caption'
+      DATASET_NAME: 'MSCOCO'
+      TARGET_SET: ['Vocab_Word']
+    DATALOADER:
+      TRAIN_BATCH_SIZE: 200
+      TEST_BATCH_SIZE: 32
+      NUM_WORKERS: 1
+      FEATS_FOLDER: 'open_source_dataset/mscoco_dataset/coco_origin'
+      ANNO_FOLDER: 'open_source_dataset/mscoco_dataset/new_annotations'
+      S3_PATH: 's3://coco/'
+      SEQ_PER_SAMPLE: 1
+      CACHE_MODE: True
+      CIRCULAR_CACHE_MODE: False
+      ZIP_MODE: False
+      CACHE_ORIGIN_IMAGE: False
+      RANDOM_CAPTION: False
+      AS_NUMPY_AS_POSSIBLE: False
+      SAMPLING_WEIGHT: 0.1144
+      TRANSFORM: 'clip_transforms'
+      RANDOM_MASK: True
+    MODEL:
+      MAX_SEQ_LEN: 50
+      EVAL_MAX_SEQ_LEN: 21
+      TEMP_NAME: logit_scale_caption
+    LOSSES:
+      NAMES: ['CrossEntropy', 'Accuracy']
+      LOSS_WEIGHT: 1.0
+      REDUCTION: 'mean'
+    DECODE_STRATEGY:
+      NAME: 'CaptionBeamSearcherV3'
+      BEAM_SIZE: 2
+      # LEN_PENALTY: 1.0
+    INFERENCE:
+      NAME: 'COCOEvaler'
+      VOCAB: 'CLIP'
+      ID_KEY: 'image_id'
+      VALUE: 'caption'
+      VAL_ANNFILE: 'open_source_dataset/mscoco_dataset/new_annotations/captions_val5k.json'
+      TEST_ANNFILE: 'open_source_dataset/mscoco_dataset/new_annotations/captions_test5k.json'
+      GENERATION_MODE: True
+
+  -
+    NAME: sbu_caption
+    DATASETS:
+      TRAIN: 'ImageTextPairDataset'
+      TASK_TYPE: 'image_caption'
+      DATASET_NAME: 'SBU'
+      TARGET_SET: ['Vocab_Word']
+    DATALOADER:
+      TRAIN_BATCH_SIZE: 200
+      TEST_BATCH_SIZE: 32
+      NUM_WORKERS: 1
+      S3_ANNO_FOLDER: 's3://SBU/annotations'
+      ANNO_FOLDER: 'open_source_dataset/sbucaption/annotations'
+      ANNO_FILENAME: 'subcaption.json'
+      FEATS_FOLDER: 'open_source_dataset/sbucaption/'
+      S3_PATH: 's3://SBU/images'
+      SEQ_PER_SAMPLE: 1
+      SAMPLER: NodeDistributed
+      CACHE_MODE: True
+      CIRCULAR_CACHE_MODE: False
+      ZIP_MODE: False
+      CACHE_ORIGIN_IMAGE: False
+      RANDOM_CAPTION: False
+      AS_NUMPY_AS_POSSIBLE: False
+      SAMPLING_WEIGHT: 0.1383
+      TRANSFORM: 'clip_transforms'
+    MODEL:
+      MAX_SEQ_LEN: 50
+      TEMP_NAME: logit_scale_caption
+    LOSSES:
+      NAMES: ['CrossEntropy', 'Accuracy']
+      LOSS_WEIGHT: 1.0
+      REDUCTION: 'mean'
+    INFERENCE:
+      VOCAB: 'CLIP'
+      GENERATION_MODE: False
+
+  -
+    NAME: yfcc_retrieve
+    DATASETS:
+      TRAIN: 'ImageTextPairDataset'
+      TASK_TYPE: 'image_retrieval'
+      DATASET_NAME: 'YFCC'
+    DATALOADER:
+      TRAIN_BATCH_SIZE: 320
+      TEST_BATCH_SIZE: 32
+      NUM_WORKERS: 2
+      S3_ANNO_FOLDER: 'cluster2:s3://yfcc'
+      ANNO_FOLDER: 'open_source_dataset/yfcc'
+      ANNO_FILENAME: 'yfcc100m_subset_available_untokenized.json'
+      FEATS_FOLDER: 'open_source_dataset/yfcc/'
+      S3_PATH: 'cluster2:s3://yfcc/'
+      SAMPLER: NodeDistributed
+      CACHE_MODE: True
+      CIRCULAR_CACHE_MODE: False
+      ZIP_MODE: False
+      CACHE_ORIGIN_IMAGE: False
+      RANDOM_CAPTION: False
+      AS_NUMPY_AS_POSSIBLE: False
+      SAMPLING_WEIGHT: 0.5840
+      TRANSFORM: 'clip_transforms'
+    MODEL:
+      MAX_SEQ_LEN: 50
+      TEMP_NAME: logit_scale_retrieve
+    LOSSES:
+      NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
+      LABELSMOOTHING: 0.1
+      LOSS_WEIGHT: 0.5
+      REDUCTION: 'mean'
+    INFERENCE:
+      VOCAB: 'CLIP'
+      GENERATION_MODE: False
+
+  -
+    NAME: cc12m_retrieve
+    DATASETS:
+      TRAIN: 'ImageTextPairDataset'
+      TASK_TYPE: 'image_retrieval'
+      DATASET_NAME: 'CC12M'
+    DATALOADER:
+      TRAIN_BATCH_SIZE: 320
+      TEST_BATCH_SIZE: 32
+      NUM_WORKERS: 2
+      S3_ANNO_FOLDER: 's3://cc12m/'
+      ANNO_FOLDER: 'open_source_dataset/c12m/'
+      ANNO_FILENAME: 'train_available.json'
+      FEATS_FOLDER: 'open_source_dataset/c12m/'
+      S3_PATH: 's3://cc12m/'
+      SAMPLER: NodeDistributed
+      CACHE_MODE: True
+      CIRCULAR_CACHE_MODE: False
+      ZIP_MODE: False
+      CACHE_ORIGIN_IMAGE: False
+      RANDOM_CAPTION: False
+      AS_NUMPY_AS_POSSIBLE: False
+      SAMPLING_WEIGHT: 0.5057
+      TRANSFORM: 'clip_transforms'
+    MODEL:
+      MAX_SEQ_LEN: 50
+      TEMP_NAME: logit_scale_retrieve
+    LOSSES:
+      NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
+      LABELSMOOTHING: 0.1
+      LOSS_WEIGHT: 0.5
+      REDUCTION: 'mean'
+    INFERENCE:
+      VOCAB: 'CLIP'
+      GENERATION_MODE: False
+
+  -
+    NAME: cc3m_retrieve
+    DATASETS:
+      TRAIN: 'ImageTextPairDataset'
+      TASK_TYPE: 'image_retrieval'
+      DATASET_NAME: 'CC3M'
+    DATALOADER:
+      TRAIN_BATCH_SIZE: 320
+      TEST_BATCH_SIZE: 32
+      NUM_WORKERS: 2
+      S3_ANNO_FOLDER: 's3://cc3m/'
+      ANNO_FOLDER: 'open_source_dataset/cc3m/'
+      ANNO_FILENAME: 'train_spacy.json'
+      FEATS_FOLDER: 'open_source_dataset/cc3m/'
+      S3_PATH: 's3://cc3m/'
+      SAMPLER: NodeDistributed
+      CACHE_MODE: True
+      CIRCULAR_CACHE_MODE: False
+      ZIP_MODE: False
+      CACHE_ORIGIN_IMAGE: False
+      RANDOM_CAPTION: False
+      AS_NUMPY_AS_POSSIBLE: False
+      SAMPLING_WEIGHT: 0.26295
+      TRANSFORM: 'clip_transforms'
+    MODEL:
+      MAX_SEQ_LEN: 50
+      TEMP_NAME: logit_scale_retrieve
+    LOSSES:
+      NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
+      LABELSMOOTHING: 0.1
+      LOSS_WEIGHT: 0.5
+      REDUCTION: 'mean'
+    INFERENCE:
+      VOCAB: 'CLIP'
+      GENERATION_MODE: False
+
+  -
+    NAME: vg_retrieve
+    DATASETS:
+      TRAIN: 'ImageTextPairDataset'
+      TASK_TYPE: 'image_retrieval'
+      DATASET_NAME: 'VG'
+    DATALOADER:
+      TRAIN_BATCH_SIZE: 320
+      TEST_BATCH_SIZE: 32
+      NUM_WORKERS: 2
+      FEATS_FOLDER: 'open_source_dataset/visual_genome/images'
+      ANNO_FOLDER: 'open_source_dataset/visual_genome/annotations'
+      S3_PATH: 's3://visual_genome/images'
+      ANNO_FILENAME: 'vg_captions_128filter.json'
+      SEQ_PER_SAMPLE: 1
+      CACHE_MODE: True
+      CIRCULAR_CACHE_MODE: False
+      ZIP_MODE: False
+      CACHE_ORIGIN_IMAGE: False
+      RANDOM_CAPTION: False
+      AS_NUMPY_AS_POSSIBLE: False
+      SAMPLING_WEIGHT: 0.1766
+      TRANSFORM: 'clip_transforms'
+    MODEL:
+      MAX_SEQ_LEN: 30
+      TEMP_NAME: logit_scale_retrieve
+    LOSSES:
+      NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
+      LABELSMOOTHING: 0.1
+      LOSS_WEIGHT: 0.5
+      REDUCTION: 'mean'
+    INFERENCE:
+      VOCAB: 'CLIP'
+      GENERATION_MODE: False
+
+  -
+    NAME: mscoco_retrieve
+    DATASETS:
+      TRAIN: 'ImageTextPairDataset'
+      # TEST: 'ImageTextPairDataset'
+      TASK_TYPE: 'image_retrieval'
+      DATASET_NAME: 'MSCOCO'
+    DATALOADER:
+      TRAIN_BATCH_SIZE: 320
+      TEST_BATCH_SIZE: 32
+      NUM_WORKERS: 1
+      FEATS_FOLDER: 'open_source_dataset/mscoco_dataset/coco_origin'
+      ANNO_FOLDER: 'open_source_dataset/mscoco_dataset/new_annotations'
+      S3_PATH: 's3://coco/'
+      SEQ_PER_SAMPLE: 1
+      CACHE_MODE: True
+      CIRCULAR_CACHE_MODE: False
+      ZIP_MODE: False
+      CACHE_ORIGIN_IMAGE: False
+      RANDOM_CAPTION: False
+      AS_NUMPY_AS_POSSIBLE: False
+      SAMPLING_WEIGHT: 0.1144
+      TRANSFORM: 'clip_transforms'
+    MODEL:
+      MAX_SEQ_LEN: 50
+      TEMP_NAME: logit_scale_retrieve
+    LOSSES:
+      NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
+      LABELSMOOTHING: 0.1
+      LOSS_WEIGHT: 0.5
+      REDUCTION: 'mean'
+    INFERENCE:
+      VOCAB: 'CLIP'
+      ID_KEY: 'image_id'
+      VALUE: 'caption'
+      NAME: 'RetrievalEvaler'
+      VAL_ANNFILE: 'open_source_dataset/flickr30k/all_data_final_val_set0_2014.jsonline'
+      TEST_ANNFILE: 'open_source_dataset/flickr30k/all_data_final_test_set0_2014.jsonline'
+      GENERATION_MODE: False
+
+  -
+    NAME: sbu_retrieve
+    DATASETS:
+      TRAIN: 'ImageTextPairDataset'
+      TASK_TYPE: 'image_retrieval'
+      DATASET_NAME: 'SBU'
+    DATALOADER:
+      TRAIN_BATCH_SIZE: 320
+      TEST_BATCH_SIZE: 32
+      NUM_WORKERS: 1
+      S3_ANNO_FOLDER: 's3://SBU/annotations'
+      ANNO_FOLDER: 'open_source_dataset/sbucaption/annotations'
+      ANNO_FILENAME: 'subcaption.json'
+      FEATS_FOLDER: 'open_source_dataset/sbucaption/'
+      S3_PATH: 's3://SBU/images'
+      SAMPLER: NodeDistributed
+      CACHE_MODE: True
+      CIRCULAR_CACHE_MODE: False
+      ZIP_MODE: False
+      CACHE_ORIGIN_IMAGE: False
+      RANDOM_CAPTION: False
+      AS_NUMPY_AS_POSSIBLE: False
+      SAMPLING_WEIGHT: 0.1383
+      TRANSFORM: 'clip_transforms'
+    MODEL:
+      MAX_SEQ_LEN: 50
+      TEMP_NAME: logit_scale_retrieve
+    LOSSES:
+      NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
+      LABELSMOOTHING: 0.1
+      LOSS_WEIGHT: 0.5
+      REDUCTION: 'mean'
+    INFERENCE:
+      VOCAB: 'CLIP'
+      GENERATION_MODE: False
+
+
+ENGINE:
+  NAME: 'UnifiedTrainer'
+
+MODEL:
+  META_ARCHITECTURE: 'MultiTaskTransformerEncoder'
+  ENCODER: 'UnifiedBertEncoder'
+
+
+  SHARE_LAYERNORM: True
+  BERT:
+    NORMALIZE_DECISION: "BERTPre"
+    DROP_PATH_PROB: 0.1
+    DROP_PATH_PROB_FIXED: True
+
+    UNIFY_QKV: True
+
+  MODEL_EMA: False
+  MODEL_EMA_DECAY: 0.9999
+
+  MAEParamsInit: True
+  POSEMBEDFIX: True
+
+
+  IMG_INPUT_SIZE: 224
+  PATCH_SIZE: 16
+  POSEMBED_SCALE: !!python/object/apply:eval ["160/224"]
+  CHECKPOINT_FILETER: False
+
+  LAYER_SCALE: True
+  LAYER_SCALE_INIT: 1e-3
+  OLD_CHECKPONT: True
+
+
+DATALOADER:
+  USE_WEIGHTED_SAMPLER: True
+  UNIFIED_DATASET: True
+  NUM_WORKERS: 32
+
+  PADDING_TO_MAX: False # True for debugging or token moe with distributed moe
+
+
+
+####################################### Optimizer #######################################
+SOLVER:
+  NAME: 'Adam'
+  TORCH_OPTIMIZER: True
+  PARAMS_SEPERATE: True
+  # PARAMS_GROUP: True
+  # EPOCH: 1
+  MAX_ITER: 45000
+  CHECKPOINT_PERIOD: 5000
+  EVAL_PERIOD: 10000000
+  BASE_LR: 0.00002
+  BIAS_LR_FACTOR: 1.0
+  WEIGHT_DECAY: 0.05
+  WEIGHT_DECAY_NORM: 0.0
+  WEIGHT_DECAY_BIAS: 0.0
+  WEIGHT_DECAY_EMBEDDING: 0.0
+  MOMENTUM: 0.9
+  DAMPENING: 0.0
+  NESTEROV: 0.0
+  BETAS: [0.9, 0.95]
+  EPS: 1e-6
+  GRAD_CLIP: 0.1
+  GRAD_CLIP_TYPE: 'norm'
+  ACCUM_ITER: 0
+  AMP_FP16: True
+  APEX_FP16: False # dangerous
+
+  WRITE_PERIOD: 50
+  MIN_LOSS_SCLE: 2048.0
+  # BF16: False # True
+  # ZEROSTAGE: 2
+
+  LOSS_SCALE_WINDOW: 200
+
+
+####################################### lr scheduler #######################################
+LR_SCHEDULER:
+  NAME: 'WarmupCosine'
+  WARMUP: 5000
+  MIN_LR: 0.000001
+
+####################################### evaluation #######################################
+INFERENCE:
+
+  VOCAB: 'CLIP'
+  ITER_BASED: True
+
+
+find_unused_parameters: true
+
+# ENCODERS:
+#   -
+#     NAME: VisualEncoder
+#     TYPE: VisualEncoder
+#     DROP_PATH_PROB: 0.0
+#     HIDDEN_SIZE: 192
+#     HIDDEN_DROPOUT_PROB: 0.
+#     HIDDEN_ACT: "gelu"
+#     NUM_ATTENTION_HEADS: 3
+#     INTERMEDIATE_SIZE: 768
+#     INTERMEDIATE_DROP: 0.
+#     FFN_DROPOUT_PROB: 0.
+#     ATTENTION_PROBS_DROPOUT_PROB: 0.
+#     NUM_HIDDEN_LAYERS: 6
+#     NUM_GENERATION_LAYERS: 0
+#     DROP_PATH_PROB_FIXED: True
+
+#   -
+#     NAME: TextEncoder
+#     TYPE: TextEncoder
+#     DROP_PATH_PROB: 0.0
+#     HIDDEN_SIZE: 192
+#     HIDDEN_DROPOUT_PROB: 0.
+#     HIDDEN_ACT: "gelu"
+#     NUM_ATTENTION_HEADS: 3
+#     INTERMEDIATE_SIZE: 768
+#     INTERMEDIATE_DROP: 0.
+#     FFN_DROPOUT_PROB: 0.
+#     ATTENTION_PROBS_DROPOUT_PROB: 0.
+#     NUM_HIDDEN_LAYERS: 6
+#     NUM_GENERATION_LAYERS: 0
+#     DROP_PATH_PROB_FIXED: True
+
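Editor's note: the per-task SAMPLING_WEIGHT values above control how often each task contributes a batch when USE_WEIGHTED_SAMPLER is on. A minimal sketch of the idea, assuming the trainer draws one task per iteration with probability proportional to its weight (the repo's unified dataset/sampler may implement this differently):

import random

# Hypothetical illustration: SAMPLING_WEIGHT values from the config above,
# normalized into per-iteration task-sampling probabilities.
weights = {
    "cc12m_caption": 0.5057, "cc3m_caption": 0.26295, "vg_caption": 0.1766,
    "mscoco_caption": 0.1144, "sbu_caption": 0.1383,
    "yfcc_retrieve": 0.5840, "cc12m_retrieve": 0.5057, "cc3m_retrieve": 0.26295,
    "vg_retrieve": 0.1766, "mscoco_retrieve": 0.1144, "sbu_retrieve": 0.1383,
}
total = sum(weights.values())
probs = {task: w / total for task, w in weights.items()}

def sample_task(rng: random.Random) -> str:
    # Pick the task whose batch is drawn at this training iteration.
    return rng.choices(list(probs), weights=list(probs.values()), k=1)[0]

rng = random.Random(0)
print(sample_task(rng))

Larger web-scale corpora (YFCC, CC12M) therefore dominate the mixture, while MSCOCO and SBU are sampled a few times less often.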
configs/BERT_L12_H768_experiments/base_model_bert_l12_h768.yaml ADDED
@@ -0,0 +1,73 @@
+
+######################################### MODEL #########################################
+MODEL:
+  VOCAB_SIZE: 49411 # include <BOS>/<EOS>
+  META_ARCHITECTURE: 'MultiTaskTransformerEncoder'
+  ENCODER: 'UnifiedBertEncoder_v3'
+  ENCODER_DIM: 768
+  DECODER: ''
+  DECODER_DIM: 768
+
+  PREDICTOR: 'EmbedClsAsRetrievalPredictor'
+  FEATURE_GATHER: True
+  LEARN_TEMP: True
+  PRED_USE_NORM: True
+  PRED_TEMPERATURE: 0.07
+
+  BertParamsInit: True
+
+  CLS_TOKEN: False
+
+  QUEUE_LEN: 1024
+  MAX_LABEL_LEN: 12
+
+  OUTPUT_PROJ: True # output projection
+
+
+  # #################################### Token embedding ####################################
+  TOKEN_EMBED:
+    NAME: 'TokenBaseEmbedding'
+    DIM: 768
+    ACTIVATION: 'none'
+    USE_NORM: True
+    DROPOUT: 0.0
+    POSITION: 'NNEmbeddingEncoding'
+    POSITION_MAX_LEN: 512
+    TYPE_VOCAB_SIZE: 2
+
+  # #################################### Visual embedding ####################################
+  VISUAL_EMBED:
+    NAME: 'none'
+
+  # #################################### video embedding ####################################
+  VIDEO_EMBED:
+    NAME: 'VideoBaseEmbedding'
+    IN_DIM: 768
+    OUT_DIM: 768
+    ACTIVATION: 'none'
+    USE_NORM: True
+    DROPOUT: 0.0
+    TYPE_SIZE: 1 # video to encoder
+    POSITION: 'NNEmbeddingEncoding'
+    MAX_LENGTH: 1600
+    PATCH_SIZE_S: 16
+    PATCH_SIZE_T: 1
+    DIVIDE_ST_POS: True
+    USE_VISUAL_TOKENIZER: True
+    USE_VISUAL_POS: True
+    MAX_FRAMES: 8
+
+  ####################################### BERT ############################################
+  BERT:
+    DROP_PATH_PROB: 0.1
+    HIDDEN_SIZE: 768
+    HIDDEN_DROPOUT_PROB: 0.
+    HIDDEN_ACT: "gelu"
+    NUM_ATTENTION_HEADS: 12
+    INTERMEDIATE_SIZE: 3072
+    INTERMEDIATE_DROP: 0.
+    FFN_DROPOUT_PROB: 0.
+    ATTENTION_PROBS_DROPOUT_PROB: 0.
+    NUM_HIDDEN_LAYERS: 12
+    NUM_GENERATION_LAYERS: 0
+
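Editor's note: the BERT block above pins down a BERT-base-sized encoder. A back-of-the-envelope parameter estimate from those values (a rough sketch that ignores biases, LayerNorms, and task-specific heads):

# L=12 layers, H=768 hidden, FFN=3072, vocab=49411, 512 positions.
L, H, FFN, V, P = 12, 768, 3072, 49411, 512
attn = 4 * H * H              # q, k, v and output projections
ffn = 2 * H * FFN             # the two feed-forward projections
per_layer = attn + ffn        # 7,077,888 weights per layer
embeddings = V * H + P * H    # token + position embeddings
total = L * per_layer + embeddings
print(f"~{total / 1e6:.0f}M parameters")  # ~123M with this vocabulary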
configs/BERT_L12_H768_experiments/bw_mlm_training.yaml ADDED
@@ -0,0 +1,309 @@
+_BASE_: "base_model_bert_l12_h768.yaml"
+
+SHARED_TARGETS:
+
+  # -
+  #   NAME: 'ImageNet1k'
+  #   SHARED_TARGETS_CFG:
+  #     FILE_PATH: 'open_source_dataset/imagenet_class_name_CLIP_with_endoftext.pkl'
+  #     DISTRIBUTED: False
+
+  -
+    NAME: 'Vocab_Word'
+    SHARED_TARGETS_CFG:
+      FILE_PATH: 'open_source_dataset/vocabulary_CLIP_with_endoftext.pkl'
+      DISTRIBUTED: True
+
+TASKS:
+
+  # -
+  #   NAME: imagenet
+  #   DATASETS:
+  #     TRAIN: 'ImageNetDataset'
+  #     # VAL: 'ImageNetDataset'
+  #     TASK_TYPE: 'image_classification'
+  #     DATASET_NAME: 'ImageNet1k'
+  #     TARGET_SET: ['ImageNet1k']
+
+  #   DATALOADER:
+  #     TRAIN_BATCH_SIZE: 256
+  #     # TEST_BATCH_SIZE: 2
+  #     NUM_WORKERS: 4
+  #     FEATS_FOLDER: 'cluster2:s3://imagenet'
+  #     ANNO_FOLDER: 'open_source_dataset/imagenet/meta'
+  #     SAMPLING_WEIGHT: 1.0
+  #     CLASS_NAME_FILE: 'open_source_dataset/imagenet_class_name.pkl'
+  #     MIXUP: 0.8
+  #     CUTMIX: 1.0
+  #     MIXUP_PROB: 1.0
+  #     MIXUP_SWITCH_PROB: 0.5
+  #     MIXUP_MODE: 'batch'
+  #     MIXUP_LABEL_SMOOTHING: 0.1
+  #   MODEL:
+  #     MAX_SEQ_LEN: -1
+  #     LABELS_NUM: 1000
+  #     TEMP_NAME: logit_scale_img_cls
+  #   LOSSES:
+  #     NAMES: ['SoftTargetCrossEntropy', 'Accuracy']
+  #     LOSS_WEIGHT: 1.0
+  #     REDUCTION: 'mean'
+  #     # LOSS_FP32: True
+  #   INFERENCE:
+  #     NAME: 'ImageNetEvaler'
+  #     ID_KEY: 'image_id'
+  #     VALUE: 'cls_logits'
+  #     VAL_ANNFILE: 'open_source_dataset/imagenet/meta/val.txt'
+  #     TEST_ANNFILE: ''
+  #     GENERATION_MODE: False
+
+  -
+    NAME: bookswiki_pretrain
+    DATASETS:
+      TRAIN: 'GeneralCorpusDataset'
+      TASK_TYPE: 'text_mlm'
+      DATASET_NAME: 'BooksWiki'
+      TARGET_SET: ['Vocab_Word']
+    DATALOADER:
+      TRAIN_BATCH_SIZE: 256
+      TEST_BATCH_SIZE: 32
+      NUM_WORKERS: 2
+      ANNO_FOLDER: 'open_source_dataset/bert_pretrain_data/bookswiki'
+      # ANNO_FOLDER: 'open_source_dataset/bert_pretrain_data/bookswiki'
+      SEQ_PER_SAMPLE: 1
+      SAMPLER: NodeDistributed
+      CACHE_MODE: True
+      SEQ_PER_SAMPLE: 128
+      MIN_SEQ_PER_SAMPLE: 128
+      APPEND_EOS: True
+      ONE_STREAM: False
+      SAMPLING_WEIGHT: 1.0
+      RANDOM_MASK: True
+    MODEL:
+      MAX_SEQ_LEN: 128
+      TEMP_NAME: logit_scale_text_mlm
+    LOSSES:
+      NAMES: ['CrossEntropy', 'Accuracy']
+      LOSS_WEIGHT: 0.5
+      REDUCTION: 'mean'
+    INFERENCE:
+      VOCAB: 'CLIP'
+      GENERATION_MODE: False
+
+  # -
+  #   NAME: mscoco_caption
+  #   DATASETS:
+  #     TRAIN: 'ImageTextPairDataset'
+  #     # VAL: 'ImageTextPairDataset'
+  #     # TEST: 'ImageTextPairDataset'
+  #     TASK_TYPE: 'image_caption'
+  #     DATASET_NAME: 'MSCOCO'
+  #     TARGET_SET: ['Vocab_Word']
+  #   DATALOADER:
+  #     TRAIN_BATCH_SIZE: 200
+  #     TEST_BATCH_SIZE: 32
+  #     NUM_WORKERS: 4
+  #     FEATS_FOLDER: 'open_source_dataset/mscoco_dataset/coco_origin'
+  #     ANNO_FOLDER: 'open_source_dataset/mscoco_dataset/new_annotations'
+  #     S3_PATH: 's3://coco/'
+  #     SEQ_PER_SAMPLE: 1
+  #     CACHE_MODE: True
+  #     CIRCULAR_CACHE_MODE: False
+  #     ZIP_MODE: False
+  #     CACHE_ORIGIN_IMAGE: False
+  #     RANDOM_CAPTION: False
+  #     AS_NUMPY_AS_POSSIBLE: False
+  #     SAMPLING_WEIGHT: 0.5
+  #     TRANSFORM: 'clip_transforms'
+  #     RANDOM_MASK: True
+  #   MODEL:
+  #     MAX_SEQ_LEN: 30
+  #     EVAL_MAX_SEQ_LEN: 21
+  #     TEMP_NAME: logit_scale_caption
+  #   LOSSES:
+  #     NAMES: ['CrossEntropy', 'Accuracy']
+  #     LOSS_WEIGHT: 0.5
+  #     REDUCTION: 'mean'
+  #   DECODE_STRATEGY:
+  #     NAME: 'CaptionBeamSearcherV3'
+  #     BEAM_SIZE: 2
+  #     # LEN_PENALTY: 1.0
+  #   INFERENCE:
+  #     NAME: 'COCOEvaler'
+  #     VOCAB: 'CLIP'
+  #     ID_KEY: 'image_id'
+  #     VALUE: 'caption'
+  #     VAL_ANNFILE: 'open_source_dataset/mscoco_dataset/new_annotations/captions_val5k.json'
+  #     TEST_ANNFILE: 'open_source_dataset/mscoco_dataset/new_annotations/captions_test5k.json'
+  #     GENERATION_MODE: True
+
+  # -
+  #   NAME: mscoco_retrieve
+  #   DATASETS:
+  #     TRAIN: 'ImageTextPairDataset'
+  #     # TEST: 'ImageTextPairDataset'
+  #     TASK_TYPE: 'image_retrieval'
+  #     DATASET_NAME: 'MSCOCO'
+  #   DATALOADER:
+  #     TRAIN_BATCH_SIZE: 256
+  #     TEST_BATCH_SIZE: 32
+  #     NUM_WORKERS: 1
+  #     FEATS_FOLDER: 'open_source_dataset/mscoco_dataset/coco_origin'
+  #     ANNO_FOLDER: 'open_source_dataset/mscoco_dataset/new_annotations'
+  #     S3_PATH: 's3://coco/'
+  #     CACHE_MODE: True
+  #     CIRCULAR_CACHE_MODE: False
+  #     ZIP_MODE: False
+  #     CACHE_ORIGIN_IMAGE: False
+  #     RANDOM_CAPTION: False
+  #     AS_NUMPY_AS_POSSIBLE: False
+  #     SEQ_PER_SAMPLE: 1
+  #     SAMPLING_WEIGHT: 0.5
+  #     TRANSFORM: 'clip_transforms'
+  #   MODEL:
+  #     MAX_SEQ_LEN: 30
+  #     TEMP_NAME: logit_scale_retrieve
+  #   LOSSES:
+  #     NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
+  #     LABELSMOOTHING: 0.1
+  #     LOSS_WEIGHT: 0.5
+  #     REDUCTION: 'mean'
+  #   INFERENCE:
+  #     VOCAB: 'CLIP'
+  #     ID_KEY: 'image_id'
+  #     VALUE: 'caption'
+  #     NAME: 'RetrievalEvaler'
+  #     VAL_ANNFILE: 'open_source_dataset/flickr30k/all_data_final_val_set0_2014.jsonline'
+  #     TEST_ANNFILE: 'open_source_dataset/flickr30k/all_data_final_test_set0_2014.jsonline'
+  #     GENERATION_MODE: False
+
+
+
+ENGINE:
+  NAME: 'UnifiedTrainer'
+
+MODEL:
+  META_ARCHITECTURE: 'MultiTaskTransformerEncoder'
+  ENCODER: 'UnifiedBertEncoder'
+
+  IN_TUNING: True # use IN1k instead of 22k
+  SHARE_LAYERNORM: True
+  BERT:
+    NORMALIZE_DECISION: "BERTPre"
+    DROP_PATH_PROB: 0.1
+    DROP_PATH_PROB_FIXED: True
+
+    UNIFY_QKV: True
+
+  MODEL_EMA: False
+  MODEL_EMA_DECAY: 0.9999
+
+  MAEParamsInit: True
+  POSEMBEDFIX: True
+
+
+  IMG_INPUT_SIZE: 224
+  PATCH_SIZE: 16
+
+  LAYER_SCALE: True
+  LAYER_SCALE_INIT: 1e-3
+
+
+DATALOADER:
+  USE_WEIGHTED_SAMPLER: True
+  UNIFIED_DATASET: True
+  NUM_WORKERS: 16
+
+  PADDING_TO_MAX: False # True for debugging or token moe with distributed moe
+
+
+
+####################################### Optimizer #######################################
+SOLVER:
+  NAME: 'Adam'
+  TORCH_OPTIMIZER: True
+  PARAMS_SEPERATE: True
+  # PARAMS_GROUP: True
+  # EPOCH: 1
+  MAX_ITER: 450000
+  CHECKPOINT_PERIOD: 50000
+  EVAL_PERIOD: 500000
+  BASE_LR: 0.001
+  BIAS_LR_FACTOR: 1.0
+  WEIGHT_DECAY: 0.05
+  WEIGHT_DECAY_NORM: 0.0
+  WEIGHT_DECAY_BIAS: 0.0
+  WEIGHT_DECAY_EMBEDDING: 0.0
+  MOMENTUM: 0.9
+  DAMPENING: 0.0
+  NESTEROV: 0.0
+  BETAS: [0.9, 0.95]
+  EPS: 1e-6
+  GRAD_CLIP: 0.1
+  GRAD_CLIP_TYPE: 'norm'
+  ACCUM_ITER: 0
+  AMP_FP16: True
+  APEX_FP16: False # dangerous
+
+  WRITE_PERIOD: 50
+  MIN_LOSS_SCLE: 2048.0
+  # BF16: False # True
+  # ZEROSTAGE: 2
+
+  LOSS_SCALE_WINDOW: 200
+
+
+
+
+####################################### lr scheduler #######################################
+LR_SCHEDULER:
+  NAME: 'WarmupCosine'
+  WARMUP: 20000
+  MIN_LR: 0.000001
+
+
+
+####################################### evaluation #######################################
+INFERENCE:
+
+  VOCAB: 'CLIP'
+  ITER_BASED: True
+
+
+find_unused_parameters: true
+
+# ENCODERS:
+#   -
+#     NAME: VisualEncoder
+#     TYPE: VisualEncoder
+#     DROP_PATH_PROB: 0.0
+#     HIDDEN_SIZE: 192
+#     HIDDEN_DROPOUT_PROB: 0.
+#     HIDDEN_ACT: "gelu"
+#     NUM_ATTENTION_HEADS: 3
+#     INTERMEDIATE_SIZE: 768
+#     INTERMEDIATE_DROP: 0.
+#     FFN_DROPOUT_PROB: 0.
+#     ATTENTION_PROBS_DROPOUT_PROB: 0.
+#     NUM_HIDDEN_LAYERS: 6
+#     NUM_GENERATION_LAYERS: 0
+#     DROP_PATH_PROB_FIXED: True
+
+#   -
+#     NAME: TextEncoder
+#     TYPE: TextEncoder
+#     DROP_PATH_PROB: 0.0
+#     HIDDEN_SIZE: 192
+#     HIDDEN_DROPOUT_PROB: 0.
+#     HIDDEN_ACT: "gelu"
+#     NUM_ATTENTION_HEADS: 3
+#     INTERMEDIATE_SIZE: 768
+#     INTERMEDIATE_DROP: 0.
+#     FFN_DROPOUT_PROB: 0.
+#     ATTENTION_PROBS_DROPOUT_PROB: 0.
+#     NUM_HIDDEN_LAYERS: 6
+#     NUM_GENERATION_LAYERS: 0
+#     DROP_PATH_PROB_FIXED: True
+
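Editor's note: this config pairs LR_SCHEDULER 'WarmupCosine' (WARMUP: 20000, MIN_LR: 1e-6) with SOLVER.BASE_LR 0.001 over MAX_ITER 450000. A minimal sketch of that schedule, assuming linear warmup followed by cosine decay to MIN_LR (the repo's actual scheduler implementation may differ in detail):

import math

def warmup_cosine_lr(step, base_lr=0.001, min_lr=1e-6,
                     warmup=20000, max_iter=450000):
    # Linear warmup from 0 to BASE_LR, then cosine decay to MIN_LR,
    # using the SOLVER/LR_SCHEDULER values from the config above.
    if step < warmup:
        return base_lr * step / warmup
    t = (step - warmup) / (max_iter - warmup)
    return min_lr + 0.5 * (base_lr - min_lr) * (1 + math.cos(math.pi * t))

print(warmup_cosine_lr(10000))   # 0.0005, halfway through warmup
print(warmup_cosine_lr(450000))  # ~1e-6 at the end of training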
configs/BERT_L12_H768_experiments/finetuning/GLUE_finetuning_experiments/GLUE_CoLA_mlm_finetune.yaml ADDED
@@ -0,0 +1,89 @@
+_BASE_: "base.yaml"
+
+SHARED_TARGETS:
+  -
+    NAME: 'CoLA'
+    SHARED_TARGETS_CFG:
+      FILE_PATH: 'open_source_dataset/GLUE_classnames/CoLA_class_name_CLIP_with_endoftext.pkl'
+      DISTRIBUTED: False
+TASKS:
+  -
+    NAME: CoLA
+    DATASETS:
+      TRAIN: 'GLUEDataset'
+      # TEST: 'GLUEDataset'
+      VAL: 'GLUEDataset'
+      TASK_TYPE: 'text_classification'
+      DATASET_NAME: 'CoLA'
+      TARGET_SET: ['CoLA']
+    DATALOADER:
+      TRAIN_BATCH_SIZE: 16
+      TEST_BATCH_SIZE: 64
+      NUM_WORKERS: 4
+      ANNO_FOLDER: 'open_source_dataset/bert_pretrain_data/glue_data/'
+
+
+    MODEL:
+      MAX_SEQ_LEN: 256
+      TEMP_NAME: logit_scale_text_mlm
+    LOSSES:
+      NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
+      LABELSMOOTHING: 0.1
+      # LOSS_WEIGHT: 1
+      REDUCTION: 'mean'
+      LOSS_FP32: False
+    INFERENCE:
+      NAME: 'GLUEEvaler'
+      VOCAB: 'CLIP'
+      GENERATION_MODE: False
+
+
+
+
+ENGINE:
+  NAME: 'UnifiedTrainer'
+
+DATALOADER:
+  USE_WEIGHTED_SAMPLER: True
+  UNIFIED_DATASET: True
+  NUM_WORKERS: 16
+
+######################################### MODEL #########################################
+MODEL:
+  MODEL_EMA: False
+  MODEL_EMA_DECAY: 0.9999
+
+####################################### Optimizer #######################################
+SOLVER:
+  NAME: 'Adam'
+  # EPOCH: 1
+  MAX_ITER: 5600
+  CHECKPOINT_PERIOD: 1000000
+  EVAL_PERIOD: 200
+  CHECKPOINT_MAX_SAVE: 1
+  BASE_LR: 0.00001
+  BIAS_LR_FACTOR: 1.0
+  WEIGHT_DECAY: 0.1
+  WEIGHT_DECAY_NORM: 0.0
+  WEIGHT_DECAY_BIAS: 0.0
+  MOMENTUM: 0.9
+  DAMPENING: 0.0
+  NESTEROV: 0.0
+  BETAS: [0.9, 0.98]
+  EPS: 1e-8
+  GRAD_CLIP: 0.5
+  GRAD_CLIP_TYPE: 'norm'
+  ACCUM_ITER: 0
+  AMP_FP16: True
+  APEX_FP16: False # dangerous
+  WRITE_PERIOD: 20
+
+####################################### lr scheduler #######################################
+LR_SCHEDULER:
+  NAME: 'WarmupCosine'
+  WARMUP: 400
+  MIN_LR: 0.00000001
+
+
+
+find_unused_parameters: true
configs/BERT_L12_H768_experiments/finetuning/GLUE_finetuning_experiments/GLUE_MNLI_mlm_finetune.yaml ADDED
@@ -0,0 +1,89 @@
+_BASE_: "base.yaml"
+
+SHARED_TARGETS:
+  -
+    NAME: 'MNLI'
+    SHARED_TARGETS_CFG:
+      FILE_PATH: 'open_source_dataset/GLUE_classnames/MNLI_class_name_CLIP_with_endoftext.pkl'
+      DISTRIBUTED: False
+
+TASKS:
+  -
+    NAME: MNLI
+    DATASETS:
+      TRAIN: 'GLUEDataset'
+      # TEST: 'GLUEDataset'
+      VAL: 'GLUEDataset'
+      TASK_TYPE: 'text_classification'
+      DATASET_NAME: 'MNLI_Match'
+      TARGET_SET: ['MNLI']
+    DATALOADER:
+      TRAIN_BATCH_SIZE: 32
+      TEST_BATCH_SIZE: 32
+      NUM_WORKERS: 4
+      ANNO_FOLDER: 'open_source_dataset/bert_pretrain_data/glue_data/'
+
+    MODEL:
+      MAX_SEQ_LEN: 256
+      TEMP_NAME: logit_scale_text_mlm
+    LOSSES:
+      NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
+      LABELSMOOTHING: 0.1
+      # LOSS_WEIGHT: 1
+      REDUCTION: 'mean'
+      LOSS_FP32: False
+    INFERENCE:
+      NAME: 'GLUEEvaler'
+      VOCAB: 'CLIP'
+      GENERATION_MODE: False
+
+
+
+
+ENGINE:
+  NAME: 'UnifiedTrainer'
+
+DATALOADER:
+  USE_WEIGHTED_SAMPLER: True
+  UNIFIED_DATASET: True
+  NUM_WORKERS: 16
+
+######################################### MODEL #########################################
+MODEL:
+  MODEL_EMA: False
+  MODEL_EMA_DECAY: 0.9999
+
+
+####################################### Optimizer #######################################
+SOLVER:
+  NAME: 'Adam'
+  MAX_ITER: 125000
+  CHECKPOINT_PERIOD: 125000
+  EVAL_PERIOD: 5000
+  CHECKPOINT_MAX_SAVE: 1
+  BASE_LR: 0.00001
+  BIAS_LR_FACTOR: 1.0
+  WEIGHT_DECAY: 0.1
+  WEIGHT_DECAY_NORM: 0.0
+  WEIGHT_DECAY_BIAS: 0.0
+  MOMENTUM: 0.9
+  DAMPENING: 0.0
+  NESTEROV: 0.0
+  BETAS: [0.9, 0.98]
+  EPS: 1e-8
+  GRAD_CLIP: 0.5
+  GRAD_CLIP_TYPE: 'norm'
+  ACCUM_ITER: 0
+  AMP_FP16: True
+  APEX_FP16: False # dangerous
+  WRITE_PERIOD: 20
+
+####################################### lr scheduler #######################################
+LR_SCHEDULER:
+  NAME: 'WarmupCosine'
+  WARMUP: 7500
+  MIN_LR: 0.00000001
+
+
+
+find_unused_parameters: true
configs/BERT_L12_H768_experiments/finetuning/GLUE_finetuning_experiments/GLUE_MRPC_mlm_finetune.yaml ADDED
@@ -0,0 +1,88 @@
+_BASE_: "base.yaml"
+
+SHARED_TARGETS:
+  -
+    NAME: 'MRPC'
+    SHARED_TARGETS_CFG:
+      FILE_PATH: 'open_source_dataset/GLUE_classnames/MRPC_class_name_CLIP_with_endoftext.pkl'
+      DISTRIBUTED: False
+TASKS:
+  -
+    NAME: MRPC
+    DATASETS:
+      TRAIN: 'GLUEDataset'
+      # TEST: 'GLUEDataset'
+      VAL: 'GLUEDataset'
+      TASK_TYPE: 'text_classification'
+      DATASET_NAME: 'MRPC'
+      TARGET_SET: ['MRPC']
+    DATALOADER:
+      TRAIN_BATCH_SIZE: 16
+      TEST_BATCH_SIZE: 64
+      NUM_WORKERS: 4
+      ANNO_FOLDER: 'open_source_dataset/bert_pretrain_data/glue_data/'
+
+    MODEL:
+      MAX_SEQ_LEN: 256
+      TEMP_NAME: logit_scale_text_mlm
+    LOSSES:
+      NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
+      LABELSMOOTHING: 0.1
+      # LOSS_WEIGHT: 1
+      REDUCTION: 'mean'
+      LOSS_FP32: False
+    INFERENCE:
+      NAME: 'GLUEEvaler'
+      VOCAB: 'CLIP'
+      GENERATION_MODE: False
+
+
+
+
+ENGINE:
+  NAME: 'UnifiedTrainer'
+
+DATALOADER:
+  USE_WEIGHTED_SAMPLER: True
+  UNIFIED_DATASET: True
+  NUM_WORKERS: 16
+
+######################################### MODEL #########################################
+MODEL:
+  MODEL_EMA: False
+  MODEL_EMA_DECAY: 0.9999
+
+
+####################################### Optimizer #######################################
+SOLVER:
+  NAME: 'Adam'
+  MAX_ITER: 2500
+  CHECKPOINT_PERIOD: 10000
+  EVAL_PERIOD: 100
+  CHECKPOINT_MAX_SAVE: 2
+  BASE_LR: 0.00001
+  BIAS_LR_FACTOR: 1.0
+  WEIGHT_DECAY: 0.1
+  WEIGHT_DECAY_NORM: 0.0
+  WEIGHT_DECAY_BIAS: 0.0
+  MOMENTUM: 0.9
+  DAMPENING: 0.0
+  NESTEROV: 0.0
+  BETAS: [0.9, 0.98]
+  EPS: 1e-8
+  GRAD_CLIP: 0.5
+  GRAD_CLIP_TYPE: 'norm'
+  ACCUM_ITER: 0
+  AMP_FP16: True
+  APEX_FP16: False # dangerous
+  WRITE_PERIOD: 20
+
+####################################### lr scheduler #######################################
+LR_SCHEDULER:
+  NAME: 'WarmupCosine'
+  WARMUP: 150
+  MIN_LR: 0.00000001
+
+
+
+find_unused_parameters: true
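Editor's note: the iteration-based schedules in these GLUE configs imply a fixed epoch budget once the split size is known. A rough sanity check for MRPC, assuming the commonly cited GLUE MRPC training split of about 3,668 pairs (an assumption, not stated in the config):

# Epochs implied by the MRPC schedule above.
train_examples = 3668    # assumed MRPC train split size
batch_size = 16          # DATALOADER.TRAIN_BATCH_SIZE
max_iter = 2500          # SOLVER.MAX_ITER
iters_per_epoch = train_examples / batch_size
print(f"~{max_iter / iters_per_epoch:.1f} epochs")  # roughly 11 epochs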
configs/BERT_L12_H768_experiments/finetuning/GLUE_finetuning_experiments/GLUE_QNLI_mlm_finetune.yaml ADDED
@@ -0,0 +1,85 @@
+_BASE_: "base.yaml"
+
+SHARED_TARGETS:
+  -
+    NAME: 'QNLI'
+    SHARED_TARGETS_CFG:
+      FILE_PATH: 'open_source_dataset/GLUE_classnames/QNLI_class_name_CLIP_with_endoftext.pkl'
+      DISTRIBUTED: False
+TASKS:
+  -
+    NAME: QNLI
+    DATASETS:
+      TRAIN: 'GLUEDataset'
+      # TEST: 'GLUEDataset'
+      VAL: 'GLUEDataset'
+      TASK_TYPE: 'text_classification'
+      DATASET_NAME: 'QNLI'
+      TARGET_SET: ['QNLI']
+    DATALOADER:
+      TRAIN_BATCH_SIZE: 16
+      TEST_BATCH_SIZE: 64
+      NUM_WORKERS: 4
+      ANNO_FOLDER: 'open_source_dataset/bert_pretrain_data/glue_data/'
+
+    MODEL:
+      MAX_SEQ_LEN: 256
+      TEMP_NAME: logit_scale_text_mlm
+    LOSSES:
+      NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
+      LABELSMOOTHING: 0.1
+      # LOSS_WEIGHT: 1
+      REDUCTION: 'mean'
+      LOSS_FP32: False
+    INFERENCE:
+      NAME: 'GLUEEvaler'
+      VOCAB: 'CLIP'
+      GENERATION_MODE: False
+
+ENGINE:
+  NAME: 'UnifiedTrainer'
+
+DATALOADER:
+  USE_WEIGHTED_SAMPLER: True
+  UNIFIED_DATASET: True
+  NUM_WORKERS: 16
+
+######################################### MODEL #########################################
+MODEL:
+  MODEL_EMA: False
+  MODEL_EMA_DECAY: 0.9999
+
+
+####################################### Optimizer #######################################
+SOLVER:
+  NAME: 'Adam'
+  MAX_ITER: 34000
+  CHECKPOINT_PERIOD: 200000
+  EVAL_PERIOD: 2000
+  CHECKPOINT_MAX_SAVE: 2
+  BASE_LR: 0.00001
+  BIAS_LR_FACTOR: 1.0
+  WEIGHT_DECAY: 0.1
+  WEIGHT_DECAY_NORM: 0.0
+  WEIGHT_DECAY_BIAS: 0.0
+  MOMENTUM: 0.9
+  DAMPENING: 0.0
+  NESTEROV: 0.0
+  BETAS: [0.9, 0.98]
+  EPS: 1e-8
+  GRAD_CLIP: 0.5
+  GRAD_CLIP_TYPE: 'norm'
+  ACCUM_ITER: 0
+  AMP_FP16: True
+  APEX_FP16: False # dangerous
+  WRITE_PERIOD: 20
+
+####################################### lr scheduler #######################################
+LR_SCHEDULER:
+  NAME: 'WarmupCosine'
+  WARMUP: 2000
+  MIN_LR: 0.00000001
+
+
+
+find_unused_parameters: true
configs/BERT_L12_H768_experiments/finetuning/GLUE_finetuning_experiments/GLUE_QQP_mlm_finetune.yaml ADDED
@@ -0,0 +1,84 @@
+_BASE_: "base.yaml"
+
+SHARED_TARGETS:
+  -
+    NAME: 'QQP'
+    SHARED_TARGETS_CFG:
+      FILE_PATH: 'open_source_dataset/GLUE_classnames/QQP_class_name_CLIP_with_endoftext.pkl'
+      DISTRIBUTED: False
+TASKS:
+  -
+    NAME: QQP
+    DATASETS:
+      TRAIN: 'GLUEDataset'
+      # TEST: 'GLUEDataset'
+      VAL: 'GLUEDataset'
+      TASK_TYPE: 'text_classification'
+      DATASET_NAME: 'QQP'
+      TARGET_SET: ['QQP']
+    DATALOADER:
+      TRAIN_BATCH_SIZE: 32
+      TEST_BATCH_SIZE: 64
+      NUM_WORKERS: 4
+      ANNO_FOLDER: 'open_source_dataset/bert_pretrain_data/glue_data/'
+
+    MODEL:
+      MAX_SEQ_LEN: 256
+      TEMP_NAME: logit_scale_text_mlm
+    LOSSES:
+      NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
+      LABELSMOOTHING: 0.1
+      # LOSS_WEIGHT: 1
+      REDUCTION: 'mean'
+      LOSS_FP32: False
+    INFERENCE:
+      NAME: 'GLUEEvaler'
+      VOCAB: 'CLIP'
+      GENERATION_MODE: False
+
+ENGINE:
+  NAME: 'UnifiedTrainer'
+
+DATALOADER:
+  USE_WEIGHTED_SAMPLER: True
+  UNIFIED_DATASET: True
+  NUM_WORKERS: 16
+
+######################################### MODEL #########################################
+MODEL:
+  MODEL_EMA: False
+  MODEL_EMA_DECAY: 0.9999
+
+####################################### Optimizer #######################################
+SOLVER:
+  NAME: 'Adam'
+  MAX_ITER: 115000
+  CHECKPOINT_PERIOD: 200000
+  EVAL_PERIOD: 5000
+  CHECKPOINT_MAX_SAVE: 2
+  BASE_LR: 0.00001
+  BIAS_LR_FACTOR: 1.0
+  WEIGHT_DECAY: 0.1
+  WEIGHT_DECAY_NORM: 0.0
+  WEIGHT_DECAY_BIAS: 0.0
+  MOMENTUM: 0.9
+  DAMPENING: 0.0
+  NESTEROV: 0.0
+  BETAS: [0.9, 0.98]
+  EPS: 1e-8
+  GRAD_CLIP: 0.5
+  GRAD_CLIP_TYPE: 'norm'
+  ACCUM_ITER: 0
+  AMP_FP16: True
+  APEX_FP16: False # dangerous
+  WRITE_PERIOD: 20
+
+####################################### lr scheduler #######################################
+LR_SCHEDULER:
+  NAME: 'WarmupCosine'
+  WARMUP: 28000
+  MIN_LR: 0.00000001
+
+
+
+find_unused_parameters: true
configs/BERT_L12_H768_experiments/finetuning/GLUE_finetuning_experiments/GLUE_RTE_mlm_finetune.yaml ADDED
@@ -0,0 +1,92 @@
+_BASE_: "base.yaml"
+
+SHARED_TARGETS:
+  -
+    NAME: 'RTE'
+    SHARED_TARGETS_CFG:
+      FILE_PATH: 'open_source_dataset/GLUE_classnames/RTE_class_name_CLIP_with_endoftext.pkl'
+      DISTRIBUTED: False
+
+
+TASKS:
+  -
+    NAME: RTE
+    DATASETS:
+      TRAIN: 'GLUEDataset'
+      # TEST: 'GLUEDataset'
+      VAL: 'GLUEDataset'
+      TASK_TYPE: 'text_classification'
+      DATASET_NAME: 'RTE'
+      TARGET_SET: ['RTE']
+    DATALOADER:
+      TRAIN_BATCH_SIZE: 16
+      TEST_BATCH_SIZE: 64
+      NUM_WORKERS: 4
+      ANNO_FOLDER: 'open_source_dataset/bert_pretrain_data/glue_data/'
+
+
+    MODEL:
+      MAX_SEQ_LEN: 256
+      TEMP_NAME: logit_scale_text_mlm
+    LOSSES:
+      NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
+      LABELSMOOTHING: 0.1
+      # LOSS_WEIGHT: 1
+      REDUCTION: 'mean'
+      LOSS_FP32: False
+    INFERENCE:
+      NAME: 'GLUEEvaler'
+      VOCAB: 'CLIP'
+      GENERATION_MODE: False
+
+
+
+
+ENGINE:
+  NAME: 'UnifiedTrainer'
+
+DATALOADER:
+  USE_WEIGHTED_SAMPLER: True
+  UNIFIED_DATASET: True
+  NUM_WORKERS: 16
+
+######################################### MODEL #########################################
+MODEL:
+  MODEL_EMA: False
+  MODEL_EMA_DECAY: 0.9999
+
+
+
+####################################### Optimizer #######################################
+SOLVER:
+  NAME: 'Adam'
+  MAX_ITER: 2500
+  CHECKPOINT_PERIOD: 10000
+  EVAL_PERIOD: 100
+  CHECKPOINT_MAX_SAVE: 2
+  BASE_LR: 0.00002
+  BIAS_LR_FACTOR: 1.0
+  WEIGHT_DECAY: 0.1
+  WEIGHT_DECAY_NORM: 0.0
+  WEIGHT_DECAY_BIAS: 0.0
+  MOMENTUM: 0.9
+  DAMPENING: 0.0
+  NESTEROV: 0.0
+  BETAS: [0.9, 0.98]
+  EPS: 1e-8
+  GRAD_CLIP: 0.5
+  GRAD_CLIP_TYPE: 'norm'
+  ACCUM_ITER: 0
+  AMP_FP16: True
+  APEX_FP16: False # dangerous
+  WRITE_PERIOD: 20
+
+####################################### lr scheduler #######################################
+LR_SCHEDULER:
+  NAME: 'WarmupCosine'
+  WARMUP: 150
+  MIN_LR: 0.00000001
+
+
+
+find_unused_parameters: true
configs/BERT_L12_H768_experiments/finetuning/GLUE_finetuning_experiments/GLUE_SST2_mlm_finetune.yaml ADDED
@@ -0,0 +1,89 @@
+_BASE_: "base.yaml"
+
+SHARED_TARGETS:
+  -
+    NAME: 'SST-2'
+    SHARED_TARGETS_CFG:
+      FILE_PATH: 'open_source_dataset/GLUE_classnames/SST-2_class_name_CLIP_with_endoftext.pkl'
+      DISTRIBUTED: False
+
+TASKS:
+  -
+    NAME: SST-2
+    DATASETS:
+      TRAIN: 'GLUEDataset'
+      # TEST: 'GLUEDataset'
+      VAL: 'GLUEDataset'
+      TASK_TYPE: 'text_classification'
+      DATASET_NAME: 'SST-2'
+      TARGET_SET: ['SST-2']
+    DATALOADER:
+      TRAIN_BATCH_SIZE: 32
+      TEST_BATCH_SIZE: 64
+      NUM_WORKERS: 4
+      ANNO_FOLDER: 'open_source_dataset/bert_pretrain_data/glue_data/'
+
+    MODEL:
+      MAX_SEQ_LEN: 256
+      TEMP_NAME: logit_scale_text_mlm
+    LOSSES:
+      NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
+      LABELSMOOTHING: 0.1
+      # LOSS_WEIGHT: 1
+      REDUCTION: 'mean'
+      LOSS_FP32: False
+    INFERENCE:
+      NAME: 'GLUEEvaler'
+      VOCAB: 'CLIP'
+      GENERATION_MODE: False
+
+
+
+
+ENGINE:
+  NAME: 'UnifiedTrainer'
+
+DATALOADER:
+  USE_WEIGHTED_SAMPLER: True
+  UNIFIED_DATASET: True
+  NUM_WORKERS: 16
+
+######################################### MODEL #########################################
+MODEL:
+  MODEL_EMA: False
+  MODEL_EMA_DECAY: 0.9999
+
+
+####################################### Optimizer #######################################
+SOLVER:
+  NAME: 'Adam'
+  MAX_ITER: 22000
+  CHECKPOINT_PERIOD: 100000
+  EVAL_PERIOD: 1000
+  CHECKPOINT_MAX_SAVE: 2
+  BASE_LR: 0.00001
+  BIAS_LR_FACTOR: 1.0
+  WEIGHT_DECAY: 0.1
+  WEIGHT_DECAY_NORM: 0.0
+  WEIGHT_DECAY_BIAS: 0.0
+  MOMENTUM: 0.9
+  DAMPENING: 0.0
+  NESTEROV: 0.0
+  BETAS: [0.9, 0.98]
+  EPS: 1e-8
+  GRAD_CLIP: 0.5
+  GRAD_CLIP_TYPE: 'norm'
+  ACCUM_ITER: 0
+  AMP_FP16: True
+  APEX_FP16: False # dangerous
+  WRITE_PERIOD: 20
+
+####################################### lr scheduler #######################################
+LR_SCHEDULER:
+  NAME: 'WarmupCosine'
+  WARMUP: 1500
+  MIN_LR: 0.00000001
+
+
+
+find_unused_parameters: true
configs/BERT_L12_H768_experiments/finetuning/GLUE_finetuning_experiments/base.yaml ADDED
@@ -0,0 +1,22 @@
+_BASE_: "../../base_model_bert_l12_h768.yaml"
+
+
+MODEL:
+  META_ARCHITECTURE: 'MultiTaskTransformerEncoder'
+  ENCODER: 'UnifiedBertEncoder'
+
+  SHARE_LAYERNORM: True
+  BERT:
+    NORMALIZE_DECISION: "BERTPre"
+    DROP_PATH_PROB: 0.1
+    DROP_PATH_PROB_FIXED: True
+
+
+  MODEL_EMA: False
+  MODEL_EMA_DECAY: 0.9999
+
+  MAEParamsInit: True
+  POSEMBEDFIX: True
+
+  LAYER_SCALE: True
+  LAYER_SCALE_INIT: 1e-3
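Editor's note: all of the GLUE configs above inherit from this file through the `_BASE_` key, which in turn inherits from base_model_bert_l12_h768.yaml. A minimal sketch of the usual semantics of such a chain, assuming a detectron2-style recursive override (the repo's actual config loader may differ):

import yaml

def merge(base: dict, child: dict) -> dict:
    # Recursively overlay child keys on the base config; nested dicts
    # are merged, scalar values in the child win.
    out = dict(base)
    for k, v in child.items():
        if k == "_BASE_":
            continue
        if isinstance(v, dict) and isinstance(out.get(k), dict):
            out[k] = merge(out[k], v)
        else:
            out[k] = v
    return out

base = yaml.safe_load("MODEL: {BERT: {DROP_PATH_PROB: 0.0}, ENCODER_DIM: 768}")
child = yaml.safe_load("MODEL: {BERT: {DROP_PATH_PROB: 0.1}}")
print(merge(base, child))  # ENCODER_DIM kept, DROP_PATH_PROB overridden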
configs/BERT_L12_H768_experiments/finetuning/flickr30k_caption_finetuning.yaml ADDED
@@ -0,0 +1,151 @@
+_BASE_: "../base_model_bert_l12_h768.yaml"
+
+SHARED_TARGETS:
+
+  -
+    NAME: 'Vocab_Word'
+    SHARED_TARGETS_CFG:
+      FILE_PATH: 'open_source_dataset/vocabulary_CLIP_with_endoftext.pkl'
+      DISTRIBUTED: True
+
+TASKS:
+  -
+    NAME: flickr30k_caption
+    DATASETS:
+      TRAIN: 'ImageTextPairDataset'
+      # VAL: 'ImageTextPairDataset'
+      TEST: 'ImageTextPairDataset'
+      TASK_TYPE: 'image_caption'
+      DATASET_NAME: 'FLICKR'
+      TARGET_SET: ['Vocab_Word']
+    DATALOADER:
+      TRAIN_BATCH_SIZE: 32
+      TEST_BATCH_SIZE: 8
+      NUM_WORKERS: 4
+      FEATS_FOLDER: 'open_source_dataset/flickr30k_images/flickr30k_images/flickr30k_images'
+      ANNO_FOLDER: 'open_source_dataset/flickr30k'
+      S3_PATH: "s3://open_dataset/flickr30k/flickr30k_images"
+      SEQ_PER_SAMPLE: 1
+      CACHE_MODE: True
+      CIRCULAR_CACHE_MODE: False
+      ZIP_MODE: False
+      CACHE_ORIGIN_IMAGE: False
+      RANDOM_CAPTION: False
+      AS_NUMPY_AS_POSSIBLE: False
+      SAMPLING_WEIGHT: 1.0
+      TRANSFORM: 'clip_transforms'
+      RANDOM_MASK: True
+    MODEL:
+      MAX_SEQ_LEN: 30
+      EVAL_MAX_SEQ_LEN: 21
+      TEMP_NAME: logit_scale_caption
+    LOSSES:
+      NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
+      # NAMES: ['CrossEntropy', 'Accuracy']
+      LABELSMOOTHING: 0.1
+      LOSS_WEIGHT: 1.0
+      REDUCTION: 'mean'
+    DECODE_STRATEGY:
+      NAME: 'CaptionBeamSearcherV3'
+      BEAM_SIZE: 2
+      # LEN_PENALTY: 2.0
+    INFERENCE:
+      NAME: 'COCOEvaler'
+      VOCAB: 'CLIP'
+      ID_KEY: 'image_id'
+      VALUE: 'caption'
+      VAL_ANNFILE: 'open_source_dataset/flickr30k/captions_val.json'
+      TEST_ANNFILE: 'open_source_dataset/flickr30k/captions_test.json'
+      GENERATION_MODE: True
+
+
+
+
+ENGINE:
+  NAME: 'UnifiedTrainer'
+
+MODEL:
+  META_ARCHITECTURE: 'MultiTaskTransformerEncoder'
+  ENCODER: 'UnifiedBertEncoder'
+
+  SHARE_LAYERNORM: True
+  BERT:
+    NORMALIZE_DECISION: "BERTPre"
+    DROP_PATH_PROB: 0.1
+    DROP_PATH_PROB_FIXED: True
+
+
+  MODEL_EMA: False
+  MODEL_EMA_DECAY: 0.9999
+
+  MAEParamsInit: True
+  POSEMBEDFIX: True
+
+
+  IMG_INPUT_SIZE: 224
+  PATCH_SIZE: 16
+
+  POSEMBED_SCALE: !!python/object/apply:eval ["160/224"]
+  CHECKPOINT_FILETER: False
+  OLD_CHECKPONT: True
+
+  LAYER_SCALE: True
+  LAYER_SCALE_INIT: 1e-3
+
+
+DATALOADER:
+  USE_WEIGHTED_SAMPLER: True
+  UNIFIED_DATASET: True
+  NUM_WORKERS: 16
+
+  PADDING_TO_MAX: False # True for debugging or token moe with distributed moe
+
+
+
+####################################### Optimizer #######################################
+SOLVER:
+  NAME: 'Adam'
+  TORCH_OPTIMIZER: True
+  PARAMS_SEPERATE: True
+  # PARAMS_GROUP: True
+  # EPOCH: 1
+  MAX_ITER: 4000
+  CHECKPOINT_PERIOD: 50000
+  EVAL_PERIOD: 500
+  BASE_LR: 0.000002
+  BIAS_LR_FACTOR: 1.0
+  WEIGHT_DECAY: 0.0001
+  WEIGHT_DECAY_NORM: 0.0
+  WEIGHT_DECAY_BIAS: 0.0
+  WEIGHT_DECAY_EMBEDDING: 0.0
+  MOMENTUM: 0.9
+  DAMPENING: 0.0
+  NESTEROV: 0.0
+  BETAS: [0.9, 0.95]
+  EPS: 1e-6
+  GRAD_CLIP: 0.1
+  GRAD_CLIP_TYPE: 'norm'
+  ACCUM_ITER: 0
+  AMP_FP16: True
+  APEX_FP16: False # dangerous
+
+  WRITE_PERIOD: 50
+  MIN_LOSS_SCLE: 2048.0
+  # BF16: False # True
+  # ZEROSTAGE: 2
+
+  LOSS_SCALE_WINDOW: 200
+
+
+
+
+
+####################################### lr scheduler #######################################
+LR_SCHEDULER:
+  NAME: 'WarmupCosine'
+  WARMUP: 500
+  MIN_LR: 0.000001
+
+
+find_unused_parameters: true
configs/BERT_L12_H768_experiments/finetuning/flickr30k_retrieval_finetuning.yaml ADDED
@@ -0,0 +1,132 @@
+_BASE_: "../base_model_bert_l12_h768.yaml"
+
+
+TASKS:
+
+
+
+  -
+    NAME: flickr30k_retrieve
+    DATASETS:
+      TRAIN: 'ImageTextPairDataset'
+      TEST: 'ImageTextPairDataset'
+      TASK_TYPE: 'image_retrieval'
+      DATASET_NAME: 'FLICKR'
+    DATALOADER:
+      TRAIN_BATCH_SIZE: 256
+      TEST_BATCH_SIZE: 128
+      NUM_WORKERS: 2
+      FEATS_FOLDER: 'open_source_dataset/flickr30k_images/flickr30k_images/flickr30k_images'
+      ANNO_FOLDER: 'open_source_dataset/flickr30k'
+      S3_PATH: 's3://open_dataset/flickr30k/flickr30k_images'
+      SEQ_PER_SAMPLE: 1
+      CACHE_MODE: True
+      CIRCULAR_CACHE_MODE: False
+      ZIP_MODE: False
+      CACHE_ORIGIN_IMAGE: False
+      RANDOM_CAPTION: False
+      AS_NUMPY_AS_POSSIBLE: False
+      SAMPLING_WEIGHT: 1.0
+      TRANSFORM: 'clip_transforms'
+    MODEL:
+      MAX_SEQ_LEN: 30
+      TEMP_NAME: logit_scale_retrieve
+    LOSSES:
+      NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
+      LABELSMOOTHING: 0.1
+      LOSS_WEIGHT: 1.0
+      REDUCTION: 'mean'
+    INFERENCE:
+      NAME: 'RetrievalEvaler'
+      GENERATION_MODE: False
+
+
+
+
+ENGINE:
+  NAME: 'UnifiedTrainer'
+
+MODEL:
+  META_ARCHITECTURE: 'MultiTaskTransformerEncoder'
+  ENCODER: 'UnifiedBertEncoder'
+
+  SHARE_LAYERNORM: True
+  BERT:
+    NORMALIZE_DECISION: "BERTPre"
+    DROP_PATH_PROB: 0.1
+    DROP_PATH_PROB_FIXED: True
+
+
+  MODEL_EMA: False
+  MODEL_EMA_DECAY: 0.9999
+
+  MAEParamsInit: True
+  POSEMBEDFIX: True
+
+
+  IMG_INPUT_SIZE: 224
+  PATCH_SIZE: 16
+
+  POSEMBED_SCALE: !!python/object/apply:eval ["160/224"]
+  CHECKPOINT_FILETER: False
+  OLD_CHECKPONT: True
+
+  LAYER_SCALE: True
+  LAYER_SCALE_INIT: 1e-3
+
+
+DATALOADER:
+  USE_WEIGHTED_SAMPLER: True
+  UNIFIED_DATASET: True
+  NUM_WORKERS: 16
+
+  PADDING_TO_MAX: False # True for debugging or token moe with distributed moe
+
+
+
+####################################### Optimizer #######################################
+SOLVER:
+  NAME: 'Adam'
+  TORCH_OPTIMIZER: True
+  PARAMS_SEPERATE: True
+  # PARAMS_GROUP: True
+  # EPOCH: 1
+  MAX_ITER: 5000
+  CHECKPOINT_PERIOD: 50000
+  EVAL_PERIOD: 500
+  BASE_LR: 0.000005
+  BIAS_LR_FACTOR: 1.0
+  WEIGHT_DECAY: 0.0001
+  WEIGHT_DECAY_NORM: 0.0
+  WEIGHT_DECAY_BIAS: 0.0
+  WEIGHT_DECAY_EMBEDDING: 0.0
+  MOMENTUM: 0.9
+  DAMPENING: 0.0
+  NESTEROV: 0.0
+  BETAS: [0.9, 0.95]
+  EPS: 1e-6
+  GRAD_CLIP: 0.1
+  GRAD_CLIP_TYPE: 'norm'
+  ACCUM_ITER: 0
+  AMP_FP16: True
+  APEX_FP16: False # dangerous
+  WRITE_PERIOD: 50
+  MIN_LOSS_SCLE: 2048.0
+  # BF16: False # True
+  # ZEROSTAGE: 2
+
+  LOSS_SCALE_WINDOW: 200
+
+
+
+
+
+
+####################################### lr scheduler #######################################
+LR_SCHEDULER:
+  NAME: 'WarmupCosine'
+  WARMUP: 200
+  MIN_LR: 0.000001
+
+find_unused_parameters: true
+
configs/BERT_L12_H768_experiments/finetuning/in1k_training.yaml ADDED
@@ -0,0 +1,135 @@
+_BASE_: "../base_model_bert_l12_h768.yaml"
+
+SHARED_TARGETS:
+
+  -
+    NAME: 'ImageNet1k'
+    SHARED_TARGETS_CFG:
+      FILE_PATH: 'open_source_dataset/imagenet_class_name_CLIP_with_endoftext.pkl'
+      DISTRIBUTED: False
+
+
+
+TASKS:
+
+  -
+    NAME: imagenet
+    DATASETS:
+      TRAIN: 'ImageNetDataset'
+      VAL: 'ImageNetDataset'
+      TASK_TYPE: 'image_classification'
+      DATASET_NAME: 'ImageNet1k'
+      TARGET_SET: ['ImageNet1k']
+
+    DATALOADER:
+      TRAIN_BATCH_SIZE: 128
+      TEST_BATCH_SIZE: 256
+      NUM_WORKERS: 4 # also used as the num_workers of the test loader
+      FEATS_FOLDER: 'open_source_dataset/imagenet'
+      S3_PATH: 'cluster2:s3://imagenet'
+      ANNO_FOLDER: 'open_source_dataset/imagenet/meta'
+      SAMPLING_WEIGHT: 1.0
+      CLASS_NAME_FILE: 'open_source_dataset/imagenet_class_name.pkl'
+      MIXUP: 0.0
+      CUTMIX: 0.0
+      MIXUP_PROB: 1.0
+      MIXUP_SWITCH_PROB: 0.5
+      MIXUP_MODE: 'batch'
+      MIXUP_LABEL_SMOOTHING: 0.1
+    MODEL:
+      MAX_SEQ_LEN: -1
+      LABELS_NUM: 1000
+      TEMP_NAME: logit_scale_img_cls
+    LOSSES:
+      NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
+      LOSS_WEIGHT: 1.0
+      REDUCTION: 'mean'
+      LABELSMOOTHING: 0.1
+    INFERENCE:
+      NAME: 'ImageNetEvaler'
+      ID_KEY: 'image_id'
+      VALUE: 'cls_logits'
+      VAL_ANNFILE: 'open_source_dataset/imagenet/meta/val.txt'
+      TEST_ANNFILE: ''
+      GENERATION_MODE: False
+
+ENGINE:
+  NAME: 'UnifiedTrainer'
+
+MODEL:
+  META_ARCHITECTURE: 'MultiTaskTransformerEncoder'
+  ENCODER: 'UnifiedBertEncoder'
+
+  SHARE_LAYERNORM: True
+  BERT:
+    NORMALIZE_DECISION: "BERTPre"
+    DROP_PATH_PROB: 0.1
+    DROP_PATH_PROB_FIXED: True
+
+
+  MODEL_EMA: False
+  MODEL_EMA_DECAY: 0.9999
+
+  MAEParamsInit: True
+  POSEMBEDFIX: True
+
+
+  IMG_INPUT_SIZE: 224
+  PATCH_SIZE: 16
+
+  POSEMBED_SCALE: !!python/object/apply:eval ["160/224"]
+  CHECKPOINT_FILETER: False
+  OLD_CHECKPONT: True
+
+  LAYER_SCALE: True
+  LAYER_SCALE_INIT: 1e-3
+
+
+DATALOADER:
+  USE_WEIGHTED_SAMPLER: True
+  UNIFIED_DATASET: True
+  NUM_WORKERS: 16
+
+  PADDING_TO_MAX: False # True for debugging or token moe with distributed moe
+
+
+
+####################################### Optimizer #######################################
+SOLVER:
+  NAME: 'Adam'
+  TORCH_OPTIMIZER: True
+  PARAMS_SEPERATE: True
+  # PARAMS_GROUP: True
+  # EPOCH: 1
+  MAX_ITER: 20000
+  CHECKPOINT_PERIOD: 20000
+  EVAL_PERIOD: 2000
+  BASE_LR: 0.00002
+  BIAS_LR_FACTOR: 1.0
+  WEIGHT_DECAY: 0.00000001
+  WEIGHT_DECAY_NORM: 0.0
+  WEIGHT_DECAY_BIAS: 0.0
+  WEIGHT_DECAY_EMBEDDING: 0.0
+  MOMENTUM: 0.9
+  DAMPENING: 0.0
+  NESTEROV: 0.0
+  BETAS: [0.9, 0.999]
+  EPS: 1e-6
+  GRAD_CLIP: 0.1
+  GRAD_CLIP_TYPE: 'norm'
+  ACCUM_ITER: 0
+  AMP_FP16: True
+  APEX_FP16: False # dangerous
+  WRITE_PERIOD: 50
+  MIN_LOSS_SCLE: 2048.0
+  LOSS_SCALE_WINDOW: 200
+
+
+####################################### lr scheduler #######################################
+LR_SCHEDULER:
+  NAME: 'WarmupCosine'
+  WARMUP: 2000
+  MIN_LR: 0.00000001
+
+
+find_unused_parameters: true
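Editor's note: the POSEMBED_SCALE value above uses the PyYAML tag `!!python/object/apply:eval ["160/224"]`, which asks the loader to call eval("160/224") at parse time (here, presumably to rescale position embeddings pretrained at 160px to the 224px fine-tuning resolution). This tag only resolves with an unsafe loader; a minimal demonstration:

import yaml

snippet = 'POSEMBED_SCALE: !!python/object/apply:eval ["160/224"]'
# unsafe_load resolves python/object/apply by calling eval("160/224");
# yaml.safe_load would reject this tag outright.
cfg = yaml.unsafe_load(snippet)
print(cfg["POSEMBED_SCALE"])  # 0.7142857142857143

Because eval runs arbitrary code, such configs should only ever be loaded from trusted sources.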
configs/BERT_L12_H768_experiments/finetuning/in1k_training_384inputsize.yaml ADDED
@@ -0,0 +1,134 @@
+_BASE_: "../base_model_bert_l12_h768.yaml"
+
+SHARED_TARGETS:
+
+  -
+    NAME: 'ImageNet1k'
+    SHARED_TARGETS_CFG:
+      FILE_PATH: 'open_source_dataset/imagenet_class_name_CLIP_with_endoftext.pkl'
+      DISTRIBUTED: False
+
+
+
+TASKS:
+
+  -
+    NAME: imagenet
+    DATASETS:
+      TRAIN: 'ImageNetDataset'
+      VAL: 'ImageNetDataset'
+      TASK_TYPE: 'image_classification'
+      DATASET_NAME: 'ImageNet1k'
+      TARGET_SET: ['ImageNet1k']
+
+    DATALOADER:
+      TRAIN_BATCH_SIZE: 64
+      TEST_BATCH_SIZE: 256
+      NUM_WORKERS: 4 # also used as the num_workers of the test loader
+      FEATS_FOLDER: 'open_source_dataset/imagenet'
+      S3_PATH: 'cluster2:s3://imagenet'
+      ANNO_FOLDER: 'open_source_dataset/imagenet/meta'
+      SAMPLING_WEIGHT: 1.0
+      CLASS_NAME_FILE: 'open_source_dataset/imagenet_class_name.pkl'
+      MIXUP: 0.0
+      CUTMIX: 0.0
+      MIXUP_PROB: 1.0
+      MIXUP_SWITCH_PROB: 0.5
+      MIXUP_MODE: 'batch'
+      MIXUP_LABEL_SMOOTHING: 0.1
+    MODEL:
+      MAX_SEQ_LEN: -1
+      LABELS_NUM: 1000
+      TEMP_NAME: logit_scale_img_cls
+    LOSSES:
+      NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
+      LOSS_WEIGHT: 1.0
+      REDUCTION: 'mean'
+      LABELSMOOTHING: 0.1
+    INFERENCE:
+      NAME: 'ImageNetEvaler'
+      ID_KEY: 'image_id'
+      VALUE: 'cls_logits'
+      VAL_ANNFILE: 'open_source_dataset/imagenet/meta/val.txt'
+      TEST_ANNFILE: ''
+      GENERATION_MODE: False
+
+ENGINE:
+  NAME: 'UnifiedTrainer'
+
+MODEL:
+  META_ARCHITECTURE: 'MultiTaskTransformerEncoder'
+  ENCODER: 'UnifiedBertEncoder'
+
+  SHARE_LAYERNORM: True
+  BERT:
+    NORMALIZE_DECISION: "BERTPre"
+    DROP_PATH_PROB: 0.1
+    DROP_PATH_PROB_FIXED: True
+
+
+  MODEL_EMA: False
+  MODEL_EMA_DECAY: 0.9999
+
+  MAEParamsInit: True
+  POSEMBEDFIX: True
+
+
+  IMG_INPUT_SIZE: 384
+  PATCH_SIZE: 16
+  POSEMBED_SCALE: !!python/object/apply:eval ["160/384"]
+  CHECKPOINT_FILETER: False
+  OLD_CHECKPONT: True
+
+  LAYER_SCALE: True
+  LAYER_SCALE_INIT: 1e-3
+
+
+DATALOADER:
+  USE_WEIGHTED_SAMPLER: True
+  UNIFIED_DATASET: True
+  NUM_WORKERS: 16
+
+  PADDING_TO_MAX: False # True for debugging or token moe with distributed moe
+
+
+
+####################################### Optimizer #######################################
+SOLVER:
+  NAME: 'Adam'
+  TORCH_OPTIMIZER: True
+  PARAMS_SEPERATE: True
+  # PARAMS_GROUP: True
+  # EPOCH: 1
+  MAX_ITER: 40000
+  CHECKPOINT_PERIOD: 40000
+  EVAL_PERIOD: 2000
+  BASE_LR: 0.00002
+  BIAS_LR_FACTOR: 1.0
+  WEIGHT_DECAY: 0.000001
+  WEIGHT_DECAY_NORM: 0.0
+  WEIGHT_DECAY_BIAS: 0.0
+  WEIGHT_DECAY_EMBEDDING: 0.0
+  MOMENTUM: 0.9
+  DAMPENING: 0.0
+  NESTEROV: 0.0
+  BETAS: [0.9, 0.999]
+  EPS: 1e-6
+  GRAD_CLIP: 0.0
+  GRAD_CLIP_TYPE: 'norm'
+  ACCUM_ITER: 0
+  AMP_FP16: True
+  APEX_FP16: False # dangerous
+  WRITE_PERIOD: 50
+  MIN_LOSS_SCLE: 2048.0
+  LOSS_SCALE_WINDOW: 200
+
+
+####################################### lr scheduler #######################################
+LR_SCHEDULER:
+  NAME: 'WarmupCosine'
+  WARMUP: 4000
+  MIN_LR: 0.000001
+
+
+find_unused_parameters: true
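Editor's note: the 384px variant above halves TRAIN_BATCH_SIZE relative to the 224px config, which follows directly from the patch arithmetic: the visual token count grows quadratically with input size.

# Patch-count arithmetic behind the 384-input variant (PATCH_SIZE: 16).
patch = 16
for size in (224, 384):
    n = (size // patch) ** 2
    print(size, n)   # 224 -> 196 patches, 384 -> 576 patches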
configs/BERT_L12_H768_experiments/finetuning/k400_training.yaml ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
+ _BASE_: "../base_model_bert_l12_h768.yaml"
+
+ SHARED_TARGETS:
+
+   -
+     NAME: 'Kinetics400'
+     SHARED_TARGETS_CFG:
+       FILE_PATH: 'open_source_dataset/k400_class_name_CLIP_with_endoftext.pkl'
+       DISTRIBUTED: False
+
+
+
+ TASKS:
+
+   -
+     NAME: K400_retrieve
+     DATASETS:
+       TRAIN: 'VideoDataSet'
+       VAL: 'VideoDataSet'
+       TASK_TYPE: 'video_classification'
+       DATASET_NAME: 'K400'
+       TARGET_SET: ['Kinetics400']
+     DATALOADER:
+       TRAIN_BATCH_SIZE: 8 # 256
+       TEST_BATCH_SIZE: 4 # debug
+       NUM_WORKERS: 4 # debug 4
+       FEATS_FOLDER: 'open_source_dataset/K400_official'
+       ANNO_FOLDER: 'open_source_dataset/K400_official'
+       S3_PATH: 's3://K400/'
+       FRAMES_PER_CLIP: 8
+       STRIDE: 32
+       FILE_EXTENSION: ''
+       ANNO_FILE: 'annotation.json'
+       TIMESFORMER_AUG: True
+       SAMPLING_WEIGHT: 1.0
+       MULTI_VEIW_NUM: 4
+       MULTI_VEIW: 'v2'
+     MODEL:
+       MAX_SEQ_LEN: -1
+       TEMP_NAME: logit_scale_video_cls
+     LOSSES:
+       NAMES: ['CrossEntropy', 'Accuracy']
+       LOSS_WEIGHT: 1.0
+     INFERENCE:
+       NAME: 'MiTEvaler'
+       ID_KEY: 'video_name'
+       VALUE: 'label'
+       VAL_ANNFILE: 'open_source_dataset/K400_official/annotation.json'
+       TEST_ANNFILE: ''
+       GENERATION_MODE: False
+       NUM_VIEWS: 1
+
+
+ ENGINE:
+   NAME: 'UnifiedTrainer'
+
+ MODEL:
+   META_ARCHITECTURE: 'MultiTaskTransformerEncoder'
+   ENCODER: 'UnifiedBertEncoder'
+
+   SHARE_LAYERNORM: True
+   BERT:
+     NORMALIZE_DECISION: "BERTPre"
+     DROP_PATH_PROB: 0.1
+     DROP_PATH_PROB_FIXED: True
+
+
+   MODEL_EMA: False
+   MODEL_EMA_DECAY: 0.9999
+
+   MAEParamsInit: True
+   POSEMBEDFIX: True
+
+
+   IMG_INPUT_SIZE: 224
+   PATCH_SIZE: 16
+
+   POSEMBED_SCALE: !!python/object/apply:eval ["160/224"]
+   CHECKPOINT_FILETER: False
+   OLD_CHECKPONT: True
+
+   LAYER_SCALE: True
+   LAYER_SCALE_INIT: 1e-3
+
+
+ DATALOADER:
+   USE_WEIGHTED_SAMPLER: True
+   UNIFIED_DATASET: True
+   NUM_WORKERS: 16
+
+   PADDING_TO_MAX: False # True for debugging or token moe with distributed moe
+
+
+
+ ####################################### Optimizer #######################################
+ SOLVER:
+   NAME: 'Adam'
+   TORCH_OPTIMIZER: True
+   PARAMS_SEPERATE: True
+   # PARAMS_GROUP: True
+   # EPOCH: 1
+   MAX_ITER: 40000
+   CHECKPOINT_PERIOD: 50000
+   EVAL_PERIOD: 2000
+   BASE_LR: 0.000005
+   BIAS_LR_FACTOR: 1.0
+   WEIGHT_DECAY: 0.0001
+   WEIGHT_DECAY_NORM: 0.0
+   WEIGHT_DECAY_BIAS: 0.0
+   WEIGHT_DECAY_EMBEDDING: 0.0
+   MOMENTUM: 0.9
+   DAMPENING: 0.0
+   NESTEROV: 0.0
+   BETAS: [0.9, 0.95]
+   EPS: 1e-6
+   GRAD_CLIP: 0.1
+   GRAD_CLIP_TYPE: 'norm'
+   ACCUM_ITER: 0
+   AMP_FP16: True
+   APEX_FP16: False # dangerous
+   WRITE_PERIOD: 50
+   MIN_LOSS_SCLE: 2048.0
+   LOSS_SCALE_WINDOW: 200
+
+
+ ####################################### lr scheduler #######################################
+ LR_SCHEDULER:
+   NAME: 'WarmupCosine'
+   WARMUP: 2000
+   MIN_LR: 0.000001
+
+
+ find_unused_parameters: true
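Every config in this commit starts from a `_BASE_` file, so only the keys listed above override the shared `base_model_bert_l12_h768.yaml`. A minimal sketch of how such inheritance is commonly resolved (recursive load-then-merge; the repo's actual loader may differ, and the `!!python/object/apply:eval` tags force a non-safe YAML loader):

```python
import os
import yaml

def deep_merge(base: dict, child: dict) -> dict:
    """Child keys override base keys; nested dicts merge recursively."""
    out = dict(base)
    for key, value in child.items():
        if isinstance(value, dict) and isinstance(out.get(key), dict):
            out[key] = deep_merge(out[key], value)
        else:
            out[key] = value
    return out

def load_config(path: str) -> dict:
    """Resolve _BASE_ by loading the parent file first, then overriding."""
    with open(path) as f:
        # unsafe_load because these files use `!!python/object/apply:eval` tags
        cfg = yaml.unsafe_load(f)
    base_rel = cfg.pop("_BASE_", None)
    if base_rel is None:
        return cfg
    base = load_config(os.path.join(os.path.dirname(path), base_rel))
    return deep_merge(base, cfg)
```

Note the `_BASE_` paths are relative to the child config (e.g. `"../base_model_bert_l12_h768.yaml"`), which the sketch resolves against the child's directory.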
configs/BERT_L12_H768_experiments/finetuning/mscoco_caption_finetuning.yaml ADDED
@@ -0,0 +1,150 @@
+ _BASE_: "../base_model_bert_l12_h768.yaml"
+
+ SHARED_TARGETS:
+
+   -
+     NAME: 'Vocab_Word'
+     SHARED_TARGETS_CFG:
+       FILE_PATH: 'open_source_dataset/vocabulary_CLIP_with_endoftext.pkl'
+       DISTRIBUTED: True
+
+ TASKS:
+   -
+     NAME: mscoco_caption
+     DATASETS:
+       TRAIN: 'ImageTextPairDataset'
+       # VAL: 'ImageTextPairDataset'
+       TEST: 'ImageTextPairDataset'
+       TASK_TYPE: 'image_caption'
+       DATASET_NAME: 'MSCOCO'
+       TARGET_SET: ['Vocab_Word']
+     DATALOADER:
+       TRAIN_BATCH_SIZE: 32
+       TEST_BATCH_SIZE: 8
+       NUM_WORKERS: 4
+       FEATS_FOLDER: 'open_source_dataset/mscoco_dataset/coco_origin'
+       ANNO_FOLDER: 'open_source_dataset/mscoco_dataset/new_annotations'
+       S3_PATH: 's3://coco/'
+       SEQ_PER_SAMPLE: 1
+       CACHE_MODE: True
+       CIRCULAR_CACHE_MODE: False
+       ZIP_MODE: False
+       CACHE_ORIGIN_IMAGE: False
+       RANDOM_CAPTION: False
+       AS_NUMPY_AS_POSSIBLE: False
+       SAMPLING_WEIGHT: 1.0
+       TRANSFORM: 'clip_transforms'
+       RANDOM_MASK: True
+     MODEL:
+       MAX_SEQ_LEN: 30
+       EVAL_MAX_SEQ_LEN: 21
+       TEMP_NAME: logit_scale_caption
+     LOSSES:
+       NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
+       LABELSMOOTHING: 0.1
+       LOSS_WEIGHT: 2.0
+       REDUCTION: 'mean'
+     DECODE_STRATEGY:
+       NAME: 'CaptionBeamSearcherV3'
+       BEAM_SIZE: 2
+       # LEN_PENALTY: 2.0
+     INFERENCE:
+       NAME: 'COCOEvaler'
+       VOCAB: 'CLIP'
+       ID_KEY: 'image_id'
+       VALUE: 'caption'
+       VAL_ANNFILE: 'open_source_dataset/mscoco_dataset/new_annotations/captions_val5k.json'
+       TEST_ANNFILE: 'open_source_dataset/mscoco_dataset/new_annotations/captions_test5k.json'
+       GENERATION_MODE: True
+
+
+
+
+ ENGINE:
+   NAME: 'UnifiedTrainer'
+
+ MODEL:
+   META_ARCHITECTURE: 'MultiTaskTransformerEncoder'
+   ENCODER: 'UnifiedBertEncoder'
+
+   SHARE_LAYERNORM: True
+   BERT:
+     NORMALIZE_DECISION: "BERTPre"
+     DROP_PATH_PROB: 0.2
+     DROP_PATH_PROB_FIXED: True
+
+
+   MODEL_EMA: False
+   MODEL_EMA_DECAY: 0.9999
+
+   MAEParamsInit: True
+   POSEMBEDFIX: True
+
+
+   IMG_INPUT_SIZE: 224
+   PATCH_SIZE: 16
+
+   POSEMBED_SCALE: !!python/object/apply:eval ["160/224"]
+   CHECKPOINT_FILETER: False
+   OLD_CHECKPONT: True
+
+   LAYER_SCALE: True
+   LAYER_SCALE_INIT: 1e-3
+
+
+ DATALOADER:
+   USE_WEIGHTED_SAMPLER: True
+   UNIFIED_DATASET: True
+   NUM_WORKERS: 16
+
+   PADDING_TO_MAX: False # True for debugging or token moe with distributed moe
+
+
+
+ ####################################### Optimizer #######################################
+ SOLVER:
+   NAME: 'Adam'
+   TORCH_OPTIMIZER: True
+   PARAMS_SEPERATE: True
+   # PARAMS_GROUP: True
+   # EPOCH: 1
+   MAX_ITER: 10000
+   CHECKPOINT_PERIOD: 50000
+   EVAL_PERIOD: 500
+   BASE_LR: 0.00002
+   BIAS_LR_FACTOR: 1.0
+   WEIGHT_DECAY: 0.0001
+   WEIGHT_DECAY_NORM: 0.0
+   WEIGHT_DECAY_BIAS: 0.0
+   WEIGHT_DECAY_EMBEDDING: 0.0
+   MOMENTUM: 0.9
+   DAMPENING: 0.0
+   NESTEROV: 0.0
+   BETAS: [0.9, 0.95]
+   EPS: 1e-6
+   GRAD_CLIP: 0.1
+   GRAD_CLIP_TYPE: 'norm'
+   ACCUM_ITER: 0
+   AMP_FP16: True
+   APEX_FP16: False # dangerous
+
+   WRITE_PERIOD: 50
+   MIN_LOSS_SCLE: 2048.0
+   # BF16: False # True
+   # ZEROSTAGE: 2
+
+   LOSS_SCALE_WINDOW: 200
+
+
+
+
+
+
+ ####################################### lr scheduler #######################################
+ LR_SCHEDULER:
+   NAME: 'WarmupCosine'
+   WARMUP: 500
+   MIN_LR: 0.000001
+
+
+ find_unused_parameters: true
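The caption fine-tuning above trains with `LabelSmoothingCrossEntropy` and `LABELSMOOTHING: 0.1`. A minimal sketch of what such a loss typically computes (generic formulation, not necessarily this repo's exact class):

```python
import torch
import torch.nn.functional as F

def label_smoothing_ce(logits: torch.Tensor, target: torch.Tensor,
                       smoothing: float = 0.1) -> torch.Tensor:
    """(1 - eps) * NLL of the gold token, plus eps spread uniformly
    over the vocabulary (the standard label-smoothing formulation)."""
    logp = F.log_softmax(logits, dim=-1)
    nll = -logp.gather(-1, target.unsqueeze(-1)).squeeze(-1)
    uniform = -logp.mean(dim=-1)
    return ((1.0 - smoothing) * nll + smoothing * uniform).mean()

# toy usage: a batch of 4 decoding steps over a 1000-token vocabulary
loss = label_smoothing_ce(torch.randn(4, 1000), torch.randint(0, 1000, (4,)))
```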
configs/BERT_L12_H768_experiments/finetuning/mscoco_retrieval_finetuning.yaml ADDED
@@ -0,0 +1,132 @@
+ _BASE_: "../base_model_bert_l12_h768.yaml"
+
+
+ TASKS:
+
+
+
+   -
+     NAME: mscoco_retrieve
+     DATASETS:
+       TRAIN: 'ImageTextPairDataset'
+       TEST: 'ImageTextPairDataset'
+       TASK_TYPE: 'image_retrieval'
+       DATASET_NAME: 'MSCOCO'
+     DATALOADER:
+       TRAIN_BATCH_SIZE: 256
+       TEST_BATCH_SIZE: 128
+       NUM_WORKERS: 2
+       FEATS_FOLDER: 'open_source_dataset/mscoco_dataset/coco_origin'
+       ANNO_FOLDER: 'open_source_dataset/mscoco_dataset/new_annotations'
+       S3_PATH: 's3://coco/'
+       SEQ_PER_SAMPLE: 1
+       CACHE_MODE: True
+       CIRCULAR_CACHE_MODE: False
+       ZIP_MODE: False
+       CACHE_ORIGIN_IMAGE: False
+       RANDOM_CAPTION: False
+       AS_NUMPY_AS_POSSIBLE: False
+       SAMPLING_WEIGHT: 0.5
+       TRANSFORM: 'clip_transforms'
+     MODEL:
+       MAX_SEQ_LEN: 30
+       TEMP_NAME: logit_scale_retrieve
+     LOSSES:
+       NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
+       LABELSMOOTHING: 0.1
+       LOSS_WEIGHT: 1.0
+       REDUCTION: 'mean'
+     INFERENCE:
+       NAME: 'RetrievalEvaler'
+       GENERATION_MODE: False
+
+
+
+
+ ENGINE:
+   NAME: 'UnifiedTrainer'
+
+ MODEL:
+   META_ARCHITECTURE: 'MultiTaskTransformerEncoder'
+   ENCODER: 'UnifiedBertEncoder'
+
+   SHARE_LAYERNORM: True
+   BERT:
+     NORMALIZE_DECISION: "BERTPre"
+     DROP_PATH_PROB: 0.2
+     DROP_PATH_PROB_FIXED: True
+
+
+   MODEL_EMA: False
+   MODEL_EMA_DECAY: 0.9999
+
+   MAEParamsInit: True
+   POSEMBEDFIX: True
+
+
+   IMG_INPUT_SIZE: 224
+   PATCH_SIZE: 16
+
+   POSEMBED_SCALE: !!python/object/apply:eval ["160/224"]
+   CHECKPOINT_FILETER: False
+   OLD_CHECKPONT: True
+
+   LAYER_SCALE: True
+   LAYER_SCALE_INIT: 1e-3
+
+
+ DATALOADER:
+   USE_WEIGHTED_SAMPLER: True
+   UNIFIED_DATASET: True
+   NUM_WORKERS: 16
+
+   PADDING_TO_MAX: False # True for debugging or token moe with distributed moe
+
+
+
+ ####################################### Optimizer #######################################
+ SOLVER:
+   NAME: 'Adam'
+   TORCH_OPTIMIZER: True
+   PARAMS_SEPERATE: True
+   # PARAMS_GROUP: True
+   # EPOCH: 1
+   MAX_ITER: 10000
+   CHECKPOINT_PERIOD: 50000
+   EVAL_PERIOD: 500
+   BASE_LR: 0.000005
+   BIAS_LR_FACTOR: 1.0
+   WEIGHT_DECAY: 0.0001
+   WEIGHT_DECAY_NORM: 0.0
+   WEIGHT_DECAY_BIAS: 0.0
+   WEIGHT_DECAY_EMBEDDING: 0.0
+   MOMENTUM: 0.9
+   DAMPENING: 0.0
+   NESTEROV: 0.0
+   BETAS: [0.9, 0.95]
+   EPS: 1e-6
+   GRAD_CLIP: 0.1
+   GRAD_CLIP_TYPE: 'norm'
+   ACCUM_ITER: 0
+   AMP_FP16: True
+   APEX_FP16: False # dangerous
+   WRITE_PERIOD: 50
+   MIN_LOSS_SCLE: 2048.0
+   # BF16: False # True
+   # ZEROSTAGE: 2
+
+   LOSS_SCALE_WINDOW: 200
+
+
+
+
+
+
+ ####################################### lr scheduler #######################################
+ LR_SCHEDULER:
+   NAME: 'WarmupCosine'
+   WARMUP: 500
+   MIN_LR: 0.000001
+
+ find_unused_parameters: true
+
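The retrieval tasks score normalized image and text embeddings under a learned temperature (the parameter named by `TEMP_NAME: logit_scale_retrieve`), CLIP-style. A hedged sketch of that scoring plus a symmetric contrastive loss over matched pairs (the config actually pairs it with label smoothing, which is omitted here for brevity):

```python
import torch
import torch.nn.functional as F

# learned temperature, initialised CLIP-style at 1/0.07
logit_scale = torch.nn.Parameter(torch.log(torch.tensor(1.0 / 0.07)))

def retrieval_logits(img_emb: torch.Tensor, txt_emb: torch.Tensor) -> torch.Tensor:
    """Cosine similarities between image and text embeddings,
    scaled by exp(logit_scale)."""
    img_emb = F.normalize(img_emb, dim=-1)
    txt_emb = F.normalize(txt_emb, dim=-1)
    return logit_scale.exp() * img_emb @ txt_emb.t()  # [N_img, N_txt]

# symmetric cross-entropy: the i-th image matches the i-th text
logits = retrieval_logits(torch.randn(8, 768), torch.randn(8, 768))
labels = torch.arange(8)
loss = 0.5 * (F.cross_entropy(logits, labels) + F.cross_entropy(logits.t(), labels))
```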
configs/BERT_L12_H768_experiments/finetuning/msvd_caption_finetuning.yaml ADDED
@@ -0,0 +1,144 @@
+ _BASE_: "../base_model_bert_l12_h768.yaml"
+
+ SHARED_TARGETS:
+
+   -
+     NAME: 'Vocab_Word'
+     SHARED_TARGETS_CFG:
+       FILE_PATH: 'open_source_dataset/vocabulary_CLIP_with_endoftext.pkl'
+       DISTRIBUTED: True
+
+ TASKS:
+   -
+     NAME: msvd_caption
+     DATASETS:
+       TRAIN: 'MSVDDataset'
+       TEST: 'MSVDDataset'
+       TASK_TYPE: 'video_caption'
+       DATASET_NAME: 'MSVDDataset'
+       TARGET_SET: ['Vocab_Word']
+     DATALOADER:
+       TRAIN_BATCH_SIZE: 2 #6
+       TEST_BATCH_SIZE: 4
+       NUM_WORKERS: 4
+       FEATS_FOLDER: 'open_source_dataset/msvd_dataset/YouTubeClips'
+       ANNO_FOLDER: 'open_source_dataset/msvd_dataset/new_annotations'
+       STRIDE: 32
+       FRAMES_PER_CLIP: 4
+       S3_PATH: 's3://msvd/YouTubeClips/'
+       TIMESFORMER_AUG: True
+       SAMPLING_WEIGHT: 1.0
+     MODEL:
+       MAX_SEQ_LEN: 30
+       EVAL_MAX_SEQ_LEN: 21
+       TEMP_NAME: logit_scale_caption
+     LOSSES:
+       NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
+       # NAMES: ['CrossEntropy', 'Accuracy']
+       LABELSMOOTHING: 0.1
+       LOSS_WEIGHT: 1.0
+       REDUCTION: 'mean'
+     DECODE_STRATEGY:
+       NAME: 'CaptionBeamSearcherV3'
+       BEAM_SIZE: 2
+       # LEN_PENALTY: 2.0
+     INFERENCE:
+       NAME: 'COCOEvaler'
+       VOCAB: 'CLIP'
+       ID_KEY: 'image_id'
+       VALUE: 'caption'
+       VAL_ANNFILE: 'open_source_dataset/msvd_dataset/new_annotations/caption_msvd_val_cocostyle.json'
+       TEST_ANNFILE: 'open_source_dataset/msvd_dataset/new_annotations/caption_msvd_test_cocostyle.json'
+       GENERATION_MODE: True
+
+
+
+
+ ENGINE:
+   NAME: 'UnifiedTrainer'
+
+ MODEL:
+   META_ARCHITECTURE: 'MultiTaskTransformerEncoder'
+   ENCODER: 'UnifiedBertEncoder'
+
+   SHARE_LAYERNORM: True
+   BERT:
+     NORMALIZE_DECISION: "BERTPre"
+     DROP_PATH_PROB: 0.1
+     DROP_PATH_PROB_FIXED: True
+
+
+   MODEL_EMA: False
+   MODEL_EMA_DECAY: 0.9999
+
+   MAEParamsInit: True
+   POSEMBEDFIX: True
+
+
+   IMG_INPUT_SIZE: 224
+   PATCH_SIZE: 16
+
+   POSEMBED_SCALE: !!python/object/apply:eval ["160/224"]
+   CHECKPOINT_FILETER: False
+   OLD_CHECKPONT: True
+
+   LAYER_SCALE: True
+   LAYER_SCALE_INIT: 1e-3
+
+
+ DATALOADER:
+   USE_WEIGHTED_SAMPLER: True
+   UNIFIED_DATASET: True
+   NUM_WORKERS: 16
+
+   PADDING_TO_MAX: False # True for debugging or token moe with distributed moe
+
+
+
+ ####################################### Optimizer #######################################
+ SOLVER:
+   NAME: 'Adam'
+   TORCH_OPTIMIZER: True
+   PARAMS_SEPERATE: True
+   # PARAMS_GROUP: True
+   # EPOCH: 1
+   MAX_ITER: 1000
+   CHECKPOINT_PERIOD: 500
+   EVAL_PERIOD: 200
+   BASE_LR: 0.00002
+   BIAS_LR_FACTOR: 1.0
+   WEIGHT_DECAY: 0.0001
+   WEIGHT_DECAY_NORM: 0.0
+   WEIGHT_DECAY_BIAS: 0.0
+   WEIGHT_DECAY_EMBEDDING: 0.0
+   MOMENTUM: 0.9
+   DAMPENING: 0.0
+   NESTEROV: 0.0
+   BETAS: [0.9, 0.95]
+   EPS: 1e-6
+   GRAD_CLIP: 0.1
+   GRAD_CLIP_TYPE: 'norm'
+   ACCUM_ITER: 0
+   AMP_FP16: True
+   APEX_FP16: False # dangerous
+
+   WRITE_PERIOD: 50
+   MIN_LOSS_SCLE: 2048.0
+   # BF16: False # True
+   # ZEROSTAGE: 2
+
+   LOSS_SCALE_WINDOW: 200
+
+
+
+
+
+
+ ####################################### lr scheduler #######################################
+ LR_SCHEDULER:
+   NAME: 'WarmupCosine'
+   WARMUP: 100
+   MIN_LR: 0.000001
+
+
+ find_unused_parameters: true
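The video tasks above decode `FRAMES_PER_CLIP` frames spaced `STRIDE` frames apart. A sketch of one common way such indices are computed (this is an assumption about the loader's behavior, not its verified implementation):

```python
import numpy as np

def sample_frame_indices(num_frames: int, frames_per_clip: int = 4,
                         stride: int = 32) -> np.ndarray:
    """Pick frames_per_clip indices `stride` frames apart, starting from a
    random offset; indices are clamped so short videos still yield a clip."""
    span = (frames_per_clip - 1) * stride + 1
    start = np.random.randint(0, max(1, num_frames - span + 1))
    idx = start + stride * np.arange(frames_per_clip)
    return np.clip(idx, 0, num_frames - 1)

print(sample_frame_indices(300))  # e.g. [ 17  49  81 113]
```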
configs/BERT_L12_H768_experiments/finetuning/msvd_retrieval_finetuning.yaml ADDED
@@ -0,0 +1,129 @@
+ _BASE_: "../base_model_bert_l12_h768.yaml"
+
+
+ TASKS:
+
+
+
+   -
+     NAME: msvd_retrieval
+     DATASETS:
+       TRAIN: 'MSVDDataset'
+       TEST: 'MSVDDataset'
+       TASK_TYPE: 'video_retrieval'
+       DATASET_NAME: 'MSVDDataset'
+       # TARGET_SET: ['Vocab_Word']
+     DATALOADER:
+       TRAIN_BATCH_SIZE: 16
+       TEST_BATCH_SIZE: 8
+       NUM_WORKERS: 8
+       FEATS_FOLDER: 'open_source_dataset/msvd_dataset/YouTubeClips'
+       ANNO_FOLDER: 'open_source_dataset/msvd_dataset/new_annotations'
+       STRIDE: 32
+       FRAMES_PER_CLIP: 4
+       S3_PATH: 's3://msvd/YouTubeClips/'
+       TIMESFORMER_AUG: True
+       SAMPLING_WEIGHT: 1.0
+     MODEL:
+       MAX_SEQ_LEN: 30
+       TEMP_NAME: logit_scale_retrieve
+     LOSSES:
+       NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
+       LABELSMOOTHING: 0.1
+       LOSS_WEIGHT: 1.0
+       REDUCTION: 'mean'
+     INFERENCE:
+       NAME: 'RetrievalEvaler'
+       GENERATION_MODE: False
+
+
+ ENGINE:
+   NAME: 'UnifiedTrainer'
+
+ MODEL:
+   META_ARCHITECTURE: 'MultiTaskTransformerEncoder'
+   ENCODER: 'UnifiedBertEncoder'
+
+   VIDEO_EMBED:
+     MAX_FRAMES: 8
+
+
+   SHARE_LAYERNORM: True
+   BERT:
+     NORMALIZE_DECISION: "BERTPre"
+     DROP_PATH_PROB: 0.1
+     DROP_PATH_PROB_FIXED: True
+
+
+   MODEL_EMA: False
+   MODEL_EMA_DECAY: 0.9999
+
+   MAEParamsInit: True
+   POSEMBEDFIX: True
+
+
+   IMG_INPUT_SIZE: 224
+   PATCH_SIZE: 16
+
+   # POSEMBED_SCALE: !!python/object/apply:eval ["160/224"]
+   CHECKPOINT_FILETER: False
+   OLD_CHECKPONT: True
+
+   LAYER_SCALE: True
+   LAYER_SCALE_INIT: 1e-3
+
+
+ DATALOADER:
+   USE_WEIGHTED_SAMPLER: True
+   UNIFIED_DATASET: True
+   NUM_WORKERS: 8
+
+   PADDING_TO_MAX: False # True for debugging or token moe with distributed moe
+
+
+
+ ####################################### Optimizer #######################################
+ SOLVER:
+   NAME: 'Adam'
+   TORCH_OPTIMIZER: True
+   PARAMS_SEPERATE: True
+   # PARAMS_GROUP: True
+   # EPOCH: 1
+   MAX_ITER: 2000
+   CHECKPOINT_PERIOD: 5000
+   EVAL_PERIOD: 200
+   BASE_LR: 0.000005
+   BIAS_LR_FACTOR: 1.0
+   WEIGHT_DECAY: 0.0001
+   WEIGHT_DECAY_NORM: 0.0
+   WEIGHT_DECAY_BIAS: 0.0
+   WEIGHT_DECAY_EMBEDDING: 0.0
+   MOMENTUM: 0.9
+   DAMPENING: 0.0
+   NESTEROV: 0.0
+   BETAS: [0.9, 0.95]
+   EPS: 1e-6
+   GRAD_CLIP: 0.1
+   GRAD_CLIP_TYPE: 'norm'
+   ACCUM_ITER: 0
+   AMP_FP16: True
+   APEX_FP16: False # dangerous
+   WRITE_PERIOD: 50
+   MIN_LOSS_SCLE: 2048.0
+   # BF16: False # True
+   # ZEROSTAGE: 2
+
+   LOSS_SCALE_WINDOW: 200
+
+
+
+
+
+ ####################################### lr scheduler #######################################
+ LR_SCHEDULER:
+   NAME: 'WarmupCosine'
+   WARMUP: 200
+   MIN_LR: 0.000001
+
+ find_unused_parameters: true
+
configs/BERT_L12_H768_experiments/finetuning/msvd_retrieval_finetuning_frames8.yaml ADDED
@@ -0,0 +1,125 @@
+ _BASE_: "../base_model_bert_l12_h768.yaml"
+
+
+ TASKS:
+
+
+
+   -
+     NAME: msvd_retrieval
+     DATASETS:
+       TRAIN: 'MSVDDataset'
+       TEST: 'MSVDDataset'
+       TASK_TYPE: 'video_retrieval'
+       DATASET_NAME: 'MSVDDataset'
+       # TARGET_SET: ['Vocab_Word']
+     DATALOADER:
+       TRAIN_BATCH_SIZE: 8
+       TEST_BATCH_SIZE: 8
+       NUM_WORKERS: 4
+       FEATS_FOLDER: 'open_source_dataset/msvd_dataset/YouTubeClips'
+       ANNO_FOLDER: 'open_source_dataset/msvd_dataset/new_annotations'
+       STRIDE: 32
+       FRAMES_PER_CLIP: 8
+       S3_PATH: 's3://msvd/YouTubeClips/'
+       TIMESFORMER_AUG: True
+       SAMPLING_WEIGHT: 1.0
+     MODEL:
+       MAX_SEQ_LEN: 30
+       TEMP_NAME: logit_scale_retrieve
+     LOSSES:
+       NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
+       LABELSMOOTHING: 0.1
+       LOSS_WEIGHT: 1.0
+       REDUCTION: 'mean'
+     INFERENCE:
+       NAME: 'RetrievalEvaler'
+       GENERATION_MODE: False
+
+
+ ENGINE:
+   NAME: 'UnifiedTrainer'
+
+ MODEL:
+   META_ARCHITECTURE: 'MultiTaskTransformerEncoder'
+   ENCODER: 'UnifiedBertEncoder'
+
+   SHARE_LAYERNORM: True
+   BERT:
+     NORMALIZE_DECISION: "BERTPre"
+     DROP_PATH_PROB: 0.1
+     DROP_PATH_PROB_FIXED: True
+
+
+   MODEL_EMA: False
+   MODEL_EMA_DECAY: 0.9999
+
+   MAEParamsInit: True
+   POSEMBEDFIX: True
+
+
+   IMG_INPUT_SIZE: 224
+   PATCH_SIZE: 16
+
+   POSEMBED_SCALE: !!python/object/apply:eval ["160/224"]
+   CHECKPOINT_FILETER: False
+   OLD_CHECKPONT: True
+
+   LAYER_SCALE: True
+   LAYER_SCALE_INIT: 1e-3
+
+
+ DATALOADER:
+   USE_WEIGHTED_SAMPLER: True
+   UNIFIED_DATASET: True
+   NUM_WORKERS: 16
+
+   PADDING_TO_MAX: False # True for debugging or token moe with distributed moe
+
+
+
+ ####################################### Optimizer #######################################
+ SOLVER:
+   NAME: 'Adam'
+   TORCH_OPTIMIZER: True
+   PARAMS_SEPERATE: True
+   # PARAMS_GROUP: True
+   # EPOCH: 1
+   MAX_ITER: 5000
+   CHECKPOINT_PERIOD: 50000
+   EVAL_PERIOD: 500
+   BASE_LR: 0.000005
+   BIAS_LR_FACTOR: 1.0
+   WEIGHT_DECAY: 0.0001
+   WEIGHT_DECAY_NORM: 0.0
+   WEIGHT_DECAY_BIAS: 0.0
+   WEIGHT_DECAY_EMBEDDING: 0.0
+   MOMENTUM: 0.9
+   DAMPENING: 0.0
+   NESTEROV: 0.0
+   BETAS: [0.9, 0.95]
+   EPS: 1e-6
+   GRAD_CLIP: 0.1
+   GRAD_CLIP_TYPE: 'norm'
+   ACCUM_ITER: 0
+   AMP_FP16: True
+   APEX_FP16: False # dangerous
+   WRITE_PERIOD: 50
+   MIN_LOSS_SCLE: 2048.0
+   # BF16: False # True
+   # ZEROSTAGE: 2
+
+   LOSS_SCALE_WINDOW: 200
+
+
+
+
+ ####################################### lr scheduler #######################################
+ LR_SCHEDULER:
+   NAME: 'WarmupCosine'
+   WARMUP: 200
+   MIN_LR: 0.000001
+
+ find_unused_parameters: true
+
configs/BERT_L12_H768_experiments/finetuning/vqa_finetuning_debug.yaml ADDED
@@ -0,0 +1,127 @@
+ _BASE_: "../base_model_bert_l12_h768.yaml"
+
+ SHARED_TARGETS:
+
+
+   -
+     NAME: 'VQA_Answer'
+     SHARED_TARGETS_CFG:
+       FILE_PATH: 'open_source_dataset/VQA_Answers_CLIP_with_endoftext.pkl'
+       DISTRIBUTED: True
+
+ TASKS:
+   -
+     NAME: vqa
+     DATASETS:
+       TRAIN: 'VQADataset'
+       VAL: 'VQADataset'
+       # TEST: 'VQADataset'
+       DATASET_NAME: 'VQA'
+       TASK_TYPE: 'vqa'
+       TARGET_SET: ['VQA_Answer']
+     DATALOADER:
+       TRAIN_BATCH_SIZE: 64
+       TEST_BATCH_SIZE: 128
+       NUM_WORKERS: 2
+       FEATS_FOLDER: 'open_source_dataset/mscoco_dataset/coco_origin'
+       ANNO_FOLDER: 'open_source_dataset/VQA'
+       SEQ_PER_SAMPLE: 1
+       MAX_FEAT_NUM: 51
+       SAMPLING_WEIGHT: 1.0
+       TRANSFORM: 'clip_transforms'
+       DO_AS_GEN: True
+       SINGLE_CLASS: True
+     MODEL:
+       MAX_SEQ_LEN: 23
+       TEMP_NAME: logit_scale_downstream
+     LOSSES:
+       # not single class
+       # NAMES: ['BCEWithLogits']
+       # LOSS_WEIGHT: 0.05
+       # for single class
+       NAMES: ['CrossEntropy', 'Accuracy']
+       LOSS_WEIGHT: 0.1
+     INFERENCE:
+       VOCAB: 'CLIP'
+       NAME: 'VQAEvaler'
+       ID_KEY: 'question_id'
+       VALUE: 'answer'
+       VAL_ANNFILE: 'open_source_dataset/VQA/val_target.pkl'
+       TEST_ANNFILE: ''
+       GENERATION_MODE: False
+
+
+ ######################################### Engine #########################################
+ ENGINE:
+   NAME: 'UnifiedTrainer'
+
+ MODEL:
+   META_ARCHITECTURE: 'MultiTaskTransformerEncoder'
+   ENCODER: 'UnifiedBertEncoder'
+
+   BERT:
+     DROP_PATH_PROB: 0.1
+
+
+   MODEL_EMA: False
+   MODEL_EMA_DECAY: 0.9999
+
+   MAEParamsInit: True
+   POSEMBEDFIX: True
+
+   TEMP_NAME: logit_scale_downstream
+   PRED_TEMPERATURE: 0.03
+   LEARN_TEMP: False
+   CLS_TOKEN: True
+
+   IMG_INPUT_SIZE: 224
+   PATCH_SIZE: 16
+
+   POSEMBED_SCALE: !!python/object/apply:eval ["160/224"]
+   CHECKPOINT_FILETER: False
+   OLD_CHECKPONT: True
+
+   LAYER_SCALE: True
+   LAYER_SCALE_INIT: 1e-3
+
+ DATALOADER:
+   USE_WEIGHTED_SAMPLER: True
+   UNIFIED_DATASET: True
+
+   PADDING_TO_MAX: False # True for debugging or token moe with distributed moe
+
+ ####################################### Optimizer #######################################
+ SOLVER:
+   NAME: 'Adam'
+   # EPOCH: 1
+   MAX_ITER: 20000
+   CHECKPOINT_PERIOD: 1000
+   EVAL_PERIOD: 1000
+   CHECKPOINT_MAX_SAVE: 2
+   BASE_LR: 0.00004
+   BIAS_LR_FACTOR: 1.0
+   WEIGHT_DECAY: 0.05
+   WEIGHT_DECAY_NORM: 0.0
+   WEIGHT_DECAY_BIAS: 0.0
+   MOMENTUM: 0.9
+   DAMPENING: 0.0
+   NESTEROV: 0.0
+   BETAS: [0.9, 0.999]
+   EPS: 1e-8
+   GRAD_CLIP: 0.0
+   GRAD_CLIP_TYPE: 'norm'
+   ACCUM_ITER: 0
+   AMP_FP16: True
+   APEX_FP16: False # dangerous
+   WRITE_PERIOD: 50
+   MIN_LOSS_SCLE: 2048.0
+   LOSS_SCALE_WINDOW: 500
+
+ ####################################### lr scheduler #######################################
+ LR_SCHEDULER:
+   NAME: 'WarmupCosine'
+   WARMUP: 1000
+   MIN_LR: 0.00000001
+
+
+ find_unused_parameters: true
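With `SINGLE_CLASS: True`, VQA here reduces to classification over the shared `VQA_Answer` target set, scored at a fixed temperature (`PRED_TEMPERATURE: 0.03`, not learned since `LEARN_TEMP: False`). A hedged sketch of what such temperature-scaled scoring typically looks like:

```python
import torch
import torch.nn.functional as F

PRED_TEMPERATURE = 0.03  # fixed: LEARN_TEMP is False in this config

def answer_logits(cls_feat: torch.Tensor, answer_emb: torch.Tensor) -> torch.Tensor:
    """Cosine similarity between the pooled [CLS] feature and every
    answer embedding, sharpened by the fixed temperature."""
    cls_feat = F.normalize(cls_feat, dim=-1)
    answer_emb = F.normalize(answer_emb, dim=-1)
    return cls_feat @ answer_emb.t() / PRED_TEMPERATURE

# toy usage; 3129 is a commonly used VQAv2 answer-set size, not taken from this repo
logits = answer_logits(torch.randn(4, 768), torch.randn(3129, 768))
loss = F.cross_entropy(logits, torch.randint(0, 3129, (4,)))
```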
configs/BERT_L12_H768_experiments/in1k_training.yaml ADDED
@@ -0,0 +1,310 @@
+ _BASE_: "base_model_bert_l12_h768.yaml"
+
+ SHARED_TARGETS:
+
+   -
+     NAME: 'ImageNet1k'
+     SHARED_TARGETS_CFG:
+       FILE_PATH: 'open_source_dataset/imagenet_class_name_CLIP_with_endoftext.pkl'
+       DISTRIBUTED: False
+
+   # -
+   #   NAME: 'Vocab_Word'
+   #   SHARED_TARGETS_CFG:
+   #     FILE_PATH: 'open_source_dataset/vocabulary_CLIP_with_endoftext.pkl'
+   #     DISTRIBUTED: True
+
+ TASKS:
+
+   -
+     NAME: imagenet
+     DATASETS:
+       TRAIN: 'ImageNetDataset'
+       VAL: 'ImageNetDataset'
+       TASK_TYPE: 'image_classification'
+       DATASET_NAME: 'ImageNet1k'
+       TARGET_SET: ['ImageNet1k']
+
+     DATALOADER:
+       TRAIN_BATCH_SIZE: 128
+       TEST_BATCH_SIZE: 128
+       NUM_WORKERS: 4 # will also be used as the num_workers of the test loader
+       FEATS_FOLDER: 'open_source_dataset/imagenet'
+       S3_PATH: 'cluster2:s3://imagenet'
+       ANNO_FOLDER: 'open_source_dataset/imagenet/meta'
+       SAMPLING_WEIGHT: 1.0
+       CLASS_NAME_FILE: 'open_source_dataset/imagenet_class_name.pkl'
+       MIXUP: 0.8
+       CUTMIX: 1.0
+       MIXUP_PROB: 1.0
+       MIXUP_SWITCH_PROB: 0.5
+       MIXUP_MODE: 'batch'
+       MIXUP_LABEL_SMOOTHING: 0.1
+     MODEL:
+       MAX_SEQ_LEN: -1
+       LABELS_NUM: 1000
+       TEMP_NAME: logit_scale_img_cls
+     LOSSES:
+       NAMES: ['SoftTargetCrossEntropy', 'Accuracy']
+       LOSS_WEIGHT: 1.0
+       REDUCTION: 'mean'
+       # LOSS_FP32: True
+     INFERENCE:
+       NAME: 'ImageNetEvaler'
+       ID_KEY: 'image_id'
+       VALUE: 'cls_logits'
+       VAL_ANNFILE: 'open_source_dataset/imagenet/meta/val.txt'
+       TEST_ANNFILE: ''
+       GENERATION_MODE: False
+
+   # -
+   #   NAME: bookswiki_pretrain
+   #   DATASETS:
+   #     TRAIN: 'GeneralCorpusDataset'
+   #     TASK_TYPE: 'text_mlm'
+   #     DATASET_NAME: 'BooksWiki'
+   #     TARGET_SET: ['Vocab_Word']
+   #   DATALOADER:
+   #     TRAIN_BATCH_SIZE: 128
+   #     TEST_BATCH_SIZE: 32
+   #     NUM_WORKERS: 2
+   #     ANNO_FOLDER: 'open_source_dataset/bert_pretrain_data/bookswiki'
+   #     # ANNO_FOLDER: 'open_source_dataset/bert_pretrain_data/bookswiki'
+   #     SEQ_PER_SAMPLE: 1
+   #     SAMPLER: NodeDistributed
+   #     CACHE_MODE: True
+   #     SEQ_PER_SAMPLE: 128
+   #     MIN_SEQ_PER_SAMPLE: 128
+   #     APPEND_EOS: True
+   #     ONE_STREAM: False
+   #     SAMPLING_WEIGHT: 1.0
+   #     RANDOM_MASK: True
+   #   MODEL:
+   #     MAX_SEQ_LEN: 128
+   #     TEMP_NAME: logit_scale_text_mlm
+   #   LOSSES:
+   #     NAMES: ['CrossEntropy', 'Accuracy']
+   #     LOSS_WEIGHT: 0.33333
+   #     REDUCTION: 'mean'
+   #   INFERENCE:
+   #     VOCAB: 'CLIP'
+   #     GENERATION_MODE: False
+
+   # -
+   #   NAME: mscoco_caption
+   #   DATASETS:
+   #     TRAIN: 'ImageTextPairDataset'
+   #     # VAL: 'ImageTextPairDataset'
+   #     # TEST: 'ImageTextPairDataset'
+   #     TASK_TYPE: 'image_caption'
+   #     DATASET_NAME: 'MSCOCO'
+   #     TARGET_SET: ['Vocab_Word']
+   #   DATALOADER:
+   #     TRAIN_BATCH_SIZE: 64
+   #     TEST_BATCH_SIZE: 32
+   #     NUM_WORKERS: 4
+   #     FEATS_FOLDER: 'open_source_dataset/mscoco_dataset/coco_origin'
+   #     ANNO_FOLDER: 'open_source_dataset/mscoco_dataset/new_annotations'
+   #     S3_PATH: 's3://coco/'
+   #     SEQ_PER_SAMPLE: 1
+   #     CACHE_MODE: True
+   #     CIRCULAR_CACHE_MODE: False
+   #     ZIP_MODE: False
+   #     CACHE_ORIGIN_IMAGE: False
+   #     RANDOM_CAPTION: False
+   #     AS_NUMPY_AS_POSSIBLE: False
+   #     SAMPLING_WEIGHT: 1.0
+   #     TRANSFORM: 'clip_transforms'
+   #     RANDOM_MASK: True
+   #   MODEL:
+   #     MAX_SEQ_LEN: 50
+   #     EVAL_MAX_SEQ_LEN: 21
+   #     TEMP_NAME: logit_scale_caption
+   #   LOSSES:
+   #     NAMES: ['CrossEntropy', 'Accuracy']
+   #     LOSS_WEIGHT: 0.33333
+   #     REDUCTION: 'mean'
+   #   DECODE_STRATEGY:
+   #     NAME: 'CaptionBeamSearcherV3'
+   #     BEAM_SIZE: 2
+   #     # LEN_PENALTY: 1.0
+   #   INFERENCE:
+   #     NAME: 'COCOEvaler'
+   #     VOCAB: 'CLIP'
+   #     ID_KEY: 'image_id'
+   #     VALUE: 'caption'
+   #     VAL_ANNFILE: 'open_source_dataset/mscoco_dataset/new_annotations/captions_val5k.json'
+   #     TEST_ANNFILE: 'open_source_dataset/mscoco_dataset/new_annotations/captions_test5k.json'
+   #     GENERATION_MODE: True
+
+   # -
+   #   NAME: mscoco_retrieve
+   #   DATASETS:
+   #     TRAIN: 'ImageTextPairDataset'
+   #     # TEST: 'ImageTextPairDataset'
+   #     TASK_TYPE: 'image_retrieval'
+   #     DATASET_NAME: 'MSCOCO'
+   #   DATALOADER:
+   #     TRAIN_BATCH_SIZE: 100
+   #     TEST_BATCH_SIZE: 32
+   #     NUM_WORKERS: 1
+   #     FEATS_FOLDER: 'open_source_dataset/mscoco_dataset/coco_origin'
+   #     ANNO_FOLDER: 'open_source_dataset/mscoco_dataset/new_annotations'
+   #     S3_PATH: 's3://coco/'
+   #     SEQ_PER_SAMPLE: 1
+   #     CACHE_MODE: True
+   #     CIRCULAR_CACHE_MODE: False
+   #     ZIP_MODE: False
+   #     CACHE_ORIGIN_IMAGE: False
+   #     RANDOM_CAPTION: False
+   #     AS_NUMPY_AS_POSSIBLE: False
+   #     SAMPLING_WEIGHT: 1.0
+   #     TRANSFORM: 'clip_transforms'
+   #   MODEL:
+   #     MAX_SEQ_LEN: 50
+   #     TEMP_NAME: logit_scale_retrieve
+   #   LOSSES:
+   #     NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
+   #     LABELSMOOTHING: 0.1
+   #     LOSS_WEIGHT: 1.0
+   #     REDUCTION: 'mean'
+   #   INFERENCE:
+   #     VOCAB: 'CLIP'
+   #     ID_KEY: 'image_id'
+   #     VALUE: 'caption'
+   #     NAME: 'RetrievalEvaler'
+   #     VAL_ANNFILE: 'open_source_dataset/flickr30k/all_data_final_val_set0_2014.jsonline'
+   #     TEST_ANNFILE: 'open_source_dataset/flickr30k/all_data_final_test_set0_2014.jsonline'
+   #     GENERATION_MODE: False
+
+
+
+ ENGINE:
+   NAME: 'UnifiedTrainer'
+
+ MODEL:
+   META_ARCHITECTURE: 'MultiTaskTransformerEncoder'
+   ENCODER: 'UnifiedBertEncoder'
+
+   IN_TUNING: True # use IN1k instead of 22k
+   SHARE_LAYERNORM: True
+   BERT:
+     NORMALIZE_DECISION: "BERTPre"
+     DROP_PATH_PROB: 0.1
+     DROP_PATH_PROB_FIXED: True
+
+   UNIFY_QKV: True
+
+   MODEL_EMA: False
+   MODEL_EMA_DECAY: 0.9999
+
+   MAEParamsInit: True
+   POSEMBEDFIX: True
+
+
+   IMG_INPUT_SIZE: 224
+   PATCH_SIZE: 16
+
+   LAYER_SCALE: True
+   LAYER_SCALE_INIT: 1e-3
+
+
+ DATALOADER:
+   USE_WEIGHTED_SAMPLER: True
+   UNIFIED_DATASET: True
+   NUM_WORKERS: 16
+
+   PADDING_TO_MAX: False # True for debugging or token moe with distributed moe
+
+
+
+ ####################################### Optimizer #######################################
+ SOLVER:
+   NAME: 'Adam'
+   TORCH_OPTIMIZER: True
+   PARAMS_SEPERATE: True
+   # PARAMS_GROUP: True
+   # EPOCH: 1
+   MAX_ITER: 200000
+   CHECKPOINT_PERIOD: 50000
+   EVAL_PERIOD: 500000
+   BASE_LR: 0.001
+   BIAS_LR_FACTOR: 1.0
+   WEIGHT_DECAY: 0.05
+   WEIGHT_DECAY_NORM: 0.0
+   WEIGHT_DECAY_BIAS: 0.0
+   WEIGHT_DECAY_EMBEDDING: 0.0
+   MOMENTUM: 0.9
+   DAMPENING: 0.0
+   NESTEROV: 0.0
+   BETAS: [0.9, 0.95]
+   EPS: 1e-6
+   GRAD_CLIP: 0.1
+   GRAD_CLIP_TYPE: 'norm'
+   ACCUM_ITER: 0
+   AMP_FP16: True
+   APEX_FP16: False # dangerous
+
+   WRITE_PERIOD: 50
+   MIN_LOSS_SCLE: 2048.0
+   # BF16: False # True
+   # ZEROSTAGE: 2
+
+   LOSS_SCALE_WINDOW: 200
+
+
+
+
+
+
+ ####################################### lr scheduler #######################################
+ LR_SCHEDULER:
+   NAME: 'WarmupCosine'
+   WARMUP: 20000
+   MIN_LR: 0.000001
+
+
+
+
+ ####################################### evaluation #######################################
+ INFERENCE:
+
+   VOCAB: 'CLIP'
+   ITER_BASED: True
+
+
+ find_unused_parameters: true
+
+ # ENCODERS:
+ #   -
+ #     NAME: VisualEncoder
+ #     TYPE: VisualEncoder
+ #     DROP_PATH_PROB: 0.0
+ #     HIDDEN_SIZE: 192
+ #     HIDDEN_DROPOUT_PROB: 0.
+ #     HIDDEN_ACT: "gelu"
+ #     NUM_ATTENTION_HEADS: 3
+ #     INTERMEDIATE_SIZE: 768
+ #     INTERMEDIATE_DROP: 0.
+ #     FFN_DROPOUT_PROB: 0.
+ #     ATTENTION_PROBS_DROPOUT_PROB: 0.
+ #     NUM_HIDDEN_LAYERS: 6
+ #     NUM_GENERATION_LAYERS: 0
+ #     DROP_PATH_PROB_FIXED: True
+
+ #   -
+ #     NAME: TextEncoder
+ #     TYPE: TextEncoder
+ #     DROP_PATH_PROB: 0.0
+ #     HIDDEN_SIZE: 192
+ #     HIDDEN_DROPOUT_PROB: 0.
+ #     HIDDEN_ACT: "gelu"
+ #     NUM_ATTENTION_HEADS: 3
+ #     INTERMEDIATE_SIZE: 768
+ #     INTERMEDIATE_DROP: 0.
+ #     FFN_DROPOUT_PROB: 0.
+ #     ATTENTION_PROBS_DROPOUT_PROB: 0.
+ #     NUM_HIDDEN_LAYERS: 6
+ #     NUM_GENERATION_LAYERS: 0
+ #     DROP_PATH_PROB_FIXED: True
+
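The ImageNet task above mixes training batches with Mixup/CutMix (`MIXUP: 0.8`, `CUTMIX: 1.0`, switching between them with probability `0.5` per batch) and trains on the resulting soft labels with `SoftTargetCrossEntropy`, matching timm-style semantics. A hedged sketch of the mixup half only, together with the loss it pairs with:

```python
import numpy as np
import torch
import torch.nn.functional as F

def mixup_batch(images: torch.Tensor, targets: torch.Tensor,
                alpha: float = 0.8, num_classes: int = 1000):
    """Blend the batch with a shuffled copy of itself; lam ~ Beta(alpha, alpha)
    as implied by MIXUP: 0.8. Returns mixed images and soft labels."""
    lam = float(np.random.beta(alpha, alpha))
    perm = torch.randperm(images.size(0))
    mixed = lam * images + (1.0 - lam) * images[perm]
    onehot = F.one_hot(targets, num_classes).float()
    soft_targets = lam * onehot + (1.0 - lam) * onehot[perm]
    return mixed, soft_targets

def soft_target_cross_entropy(logits: torch.Tensor, soft_targets: torch.Tensor):
    """The SoftTargetCrossEntropy pairing: -sum(q * log p) averaged over the batch."""
    return torch.sum(-soft_targets * F.log_softmax(logits, dim=-1), dim=-1).mean()
```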
configs/BERT_L12_H768_experiments/moe_finetuning/GLUE_finetuning_experiments/GLUE_CoLA_mlm_finetune.yaml ADDED
@@ -0,0 +1,89 @@
+ _BASE_: "base.yaml"
+
+ SHARED_TARGETS:
+   -
+     NAME: 'CoLA-target'
+     SHARED_TARGETS_CFG:
+       FILE_PATH: 'open_source_dataset/GLUE_classnames/CoLA_class_name_CLIP_with_endoftext.pkl'
+       DISTRIBUTED: False
+ TASKS:
+   -
+     NAME: CoLA
+     DATASETS:
+       TRAIN: 'GLUEDataset'
+       # TEST: 'GLUEDataset'
+       VAL: 'GLUEDataset'
+       TASK_TYPE: 'text_classification'
+       DATASET_NAME: 'CoLA'
+       TARGET_SET: ['CoLA-target']
+     DATALOADER:
+       TRAIN_BATCH_SIZE: 16
+       TEST_BATCH_SIZE: 64
+       NUM_WORKERS: 4
+       ANNO_FOLDER: 'open_source_dataset/bert_pretrain_data/glue_data/'
+
+
+     MODEL:
+       MAX_SEQ_LEN: 256
+       TEMP_NAME: logit_scale_text_mlm
+     LOSSES:
+       NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
+       LABELSMOOTHING: 0.1
+       # LOSS_WEIGHT: 1
+       REDUCTION: 'mean'
+       LOSS_FP32: False
+     INFERENCE:
+       NAME: 'GLUEEvaler'
+       VOCAB: 'CLIP'
+       GENERATION_MODE: False
+
+
+
+
+ ENGINE:
+   NAME: 'UnifiedTrainer'
+
+ DATALOADER:
+   USE_WEIGHTED_SAMPLER: True
+   UNIFIED_DATASET: True
+   NUM_WORKERS: 16
+
+ ######################################### MODEL #########################################
+ MODEL:
+   MODEL_EMA: False
+   MODEL_EMA_DECAY: 0.9999
+
+ ####################################### Optimizer #######################################
+ SOLVER:
+   NAME: 'Adam'
+   # EPOCH: 1
+   MAX_ITER: 5600
+   CHECKPOINT_PERIOD: 1000000
+   EVAL_PERIOD: 200
+   CHECKPOINT_MAX_SAVE: 1
+   BASE_LR: 0.00001
+   BIAS_LR_FACTOR: 1.0
+   WEIGHT_DECAY: 0.1
+   WEIGHT_DECAY_NORM: 0.0
+   WEIGHT_DECAY_BIAS: 0.0
+   MOMENTUM: 0.9
+   DAMPENING: 0.0
+   NESTEROV: 0.0
+   BETAS: [0.9, 0.98]
+   EPS: 1e-8
+   GRAD_CLIP: 0.5
+   GRAD_CLIP_TYPE: 'norm'
+   ACCUM_ITER: 0
+   AMP_FP16: True
+   APEX_FP16: False # dangerous
+   WRITE_PERIOD: 20
+
+ ####################################### lr scheduler #######################################
+ LR_SCHEDULER:
+   NAME: 'WarmupCosine'
+   WARMUP: 400
+   MIN_LR: 0.00000001
+
+
+
+ find_unused_parameters: true
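This GLUE fine-tuning config clips gradients by global norm (`GRAD_CLIP: 0.5`, `GRAD_CLIP_TYPE: 'norm'`) under fp16 autocast (`AMP_FP16: True`). In a standard PyTorch AMP loop that combination typically corresponds to unscaling before clipping; a generic sketch of one training step (stand-in model, not the repo's `UnifiedTrainer`):

```python
import torch
import torch.nn.functional as F

model = torch.nn.Linear(768, 2)                      # stand-in for the real encoder
opt = torch.optim.Adam(model.parameters(), lr=1e-5,  # BASE_LR
                       betas=(0.9, 0.98), eps=1e-8, weight_decay=0.1)
scaler = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available())  # AMP_FP16

def train_step(batch: torch.Tensor, target: torch.Tensor) -> None:
    opt.zero_grad()
    with torch.autocast("cuda", enabled=torch.cuda.is_available()):
        loss = F.cross_entropy(model(batch), target)
    scaler.scale(loss).backward()
    scaler.unscale_(opt)  # unscale first so the clip sees true gradient norms
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.5)  # GRAD_CLIP
    scaler.step(opt)
    scaler.update()
```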