xingjianleng commited on
Commit
510b154
·
1 Parent(s): 033814d
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.png filter=lfs diff=lfs merge=lfs -text
37
+ *.jpg filter=lfs diff=lfs merge=lfs -text
38
+ *.jpeg filter=lfs diff=lfs merge=lfs -text
.mdl ADDED
Binary file (73 Bytes). View file
 
.msc ADDED
Binary file (753 Bytes). View file
 
.mv ADDED
@@ -0,0 +1 @@
 
 
1
+ Revision:v1.0.0,CreatedAt:1666686679
README.md ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tasks:
3
+ - visual-question-answering
4
+
5
+ widgets:
6
+ - task: visual-question-answering
7
+ inputs:
8
+ - type: image
9
+ name: image
10
+ title: 图片
11
+ validator:
12
+ max_size: 10M
13
+ max_resolution: 5000*5000
14
+ - type: text
15
+ name: question
16
+ title: 问题
17
+ examples:
18
+ - name: 1
19
+ title: 示例1
20
+ inputs:
21
+ - name: image
22
+ data: https://alice-open.oss-cn-zhangjiakou.aliyuncs.com/mPLUG/image_mplug_vqa_5.jpg
23
+ - name: question
24
+ data: what name is this guy?
25
+ - name: 2
26
+ title: 示例2
27
+ inputs:
28
+ - name: image
29
+ data: https://alice-open.oss-cn-zhangjiakou.aliyuncs.com/mPLUG/image_mplug_vqa_4.jpg
30
+ - name: question
31
+ data: what is the name of the planet?
32
+ - name: 3
33
+ title: 示例3
34
+ inputs:
35
+ - name: image
36
+ data: https://alice-open.oss-cn-zhangjiakou.aliyuncs.com/mPLUG/image_mplug_vqa_1.jpg
37
+ - name: question
38
+ data: what airline owns this plane?
39
+ - name: 4
40
+ title: 示例4
41
+ inputs:
42
+ - name: image
43
+ data: http://xingchen-data.oss-cn-zhangjiakou.aliyuncs.com/maas/visual-question-answering/visual_question_answering.png
44
+ - name: question
45
+ data: what is grown on the plant?
46
+ - name: 5
47
+ title: 示例5
48
+ inputs:
49
+ - name: image
50
+ data: https://alice-open.oss-cn-zhangjiakou.aliyuncs.com/mPLUG/image_mplug_vqa_3.jpg
51
+ - name: question
52
+ data: What do you call the devices on top of the pole?
53
+ - name: 6
54
+ title: 示例6
55
+ inputs:
56
+ - name: image
57
+ data: https://alice-open.oss-cn-zhangjiakou.aliyuncs.com/mPLUG/image_mplug_vqa_2.jpg
58
+ - name: question
59
+ data: what does this machine do?
60
+ inferencespec:
61
+ cpu: 4
62
+ memory: 12000
63
+ gpu: 1
64
+ gpu_memory: 16000
65
+
66
+ model-type:
67
+ - mplug
68
+
69
+ domain:
70
+ - multi-modal
71
+
72
+ frameworks:
73
+ - pytorch
74
+
75
+ backbone:
76
+ - transformer
77
+
78
+ containers:
79
+
80
+ metrics:
81
+ - accuracy
82
+
83
+ license: Apache License 2.0
84
+
85
+ finetune-support: True
86
+
87
+ language:
88
+ - en
89
+
90
+ tags:
91
+ - transformer
92
+ - Alibaba
93
+ - arxiv:abs/2205.12005
94
+
95
+ datasets:
96
+ train:
97
+ - 14M image-text pairs(google cc, mscoco, vg, sbu)
98
+ - VQA
99
+ test:
100
+ - VQA test set
101
+ - modelscope/vqa_trial
102
+ ---
103
+
104
+ # 视觉问答介绍
105
+ 视觉问答:给定一个问题和图片,通过图片信息来给出答案。需要模型具备多模态理解的能力,目前主流的方法大多是基于多模态预训练,最为知名的视觉问答数据集包括:VQA,GQA等。
106
+
107
+ ## 模型描述
108
+
109
+ 本任务是mPLUG,在英文VQA数据集进行finetune的视觉问答下游任务。mPLUG模型是统一理解和生成的多模态基础模型,该模型提出了基于skip-connections的高效跨模态融合框架。其中,mPLUG在VQA上支持开放域生成,达到开放域生成的SOTA,详见:[mPLUG: Effective and Efficient Vision-Language Learning by Cross-modal Skip-connections](https://arxiv.org/abs/2205.12005)
110
+
111
+ ![mplug](./resources/model.png)
112
+
113
+ 模型生成结果如下图所示:
114
+
115
+ ![vqa_case](./resources/case.png)
116
+
117
+
118
+ ## 期望模型使用方式以及适用范围
119
+ 本模型主要用于给问题和对应图片生成答案。用户可以自行尝试各种输入的图片和问题。具体调用方式请参考代码示例。
120
+
121
+ ### 如何使用
122
+ 在安装完成MaaS-lib之后即可使用visual-question-answering的能力 (注意:模型运行约需占用 9G 内存)
123
+
124
+ #### 代码范例
125
+ ```python
126
+ from modelscope.pipelines import pipeline
127
+ from modelscope.utils.constant import Tasks
128
+
129
+ model_id = 'damo/mplug_visual-question-answering_coco_large_en'
130
+ input_vqa = {
131
+ 'image': 'https://alice-open.oss-cn-zhangjiakou.aliyuncs.com/mPLUG/image_mplug_vqa.jpg',
132
+ 'question': 'What is the woman doing?',
133
+ }
134
+
135
+ pipeline_vqa = pipeline(Tasks.visual_question_answering, model=model_id)
136
+ print(pipeline_vqa(input_vqa))
137
+
138
+ ```
139
+
140
+ ### 模型局限性以及可能的偏差
141
+ 模型在数据集上训练,有可能产生一些偏差,请用户自行评测后决定如何使用。
142
+
143
+ ## 训练数据介绍
144
+ 本模型训练数据集是VQA,数据集包含83k图片, 具体数据可以[下载](https://visualqa.org/)
145
+
146
+ ## 模型训练流程
147
+
148
+ ### 预处理
149
+ 训练数据集需要包含 image,question,answer,以下为使用截取的部分 coco_caption 数据集进行的训练代码预处理示例:
150
+ ```python
151
+ datadict = MsDataset.load('coco_captions_small_slice')
152
+ self.train_dataset = MsDataset(datadict['train'].to_hf_dataset().map(
153
+ lambda _: {
154
+ 'question': 'what the picture describes?'
155
+ }).rename_column('image:FILE',
156
+ 'image').rename_column('answer:Value', 'answer'))
157
+ self.test_dataset = MsDataset(datadict['test'].to_hf_dataset().map(
158
+ lambda _: {
159
+ 'question': 'what the picture describes?'
160
+ }).rename_column('image:FILE',
161
+ 'image').rename_column('answer:Value', 'answer'))
162
+ ```
163
+
164
+ ### 训练
165
+ 以下为使用 modelscope 中的 trainer 进行训练的代码示例:
166
+ ```python
167
+ kwargs = dict(
168
+ model='damo/mplug_visual-question-answering_coco_large_en',
169
+ train_dataset=self.train_dataset,
170
+ eval_dataset=self.test_dataset,
171
+ max_epochs=self.max_epochs,
172
+ work_dir=self.tmp_dir)
173
+
174
+ trainer: EpochBasedTrainer = build_trainer(
175
+ name=Trainers.nlp_base_trainer, default_args=kwargs)
176
+ trainer.train()
177
+ ```
178
+
179
+ ## 数据评估及结果
180
+ mPLUG在VQA数据集,同等规模和预训练数据的模型中取得SOTA,VQA榜单上排名前列
181
+
182
+ ![mplug_vqa_score](./resources/vqa_exp.png)
183
+
184
+ ![vqa_leaderboard](./resources/vqa.png)
185
+ ### 相关论文以及引用信息
186
+ 如果我们的模型对您有帮助,请您引用我们的文章:
187
+ ```BibTeX
188
+ @inproceedings{li2022mplug,
189
+ title={mPLUG: Effective and Efficient Vision-Language Learning by Cross-modal Skip-connections},
190
+ author={Li, Chenliang and Xu, Haiyang and Tian, Junfeng and Wang, Wei and Yan, Ming and Bi, Bin and Ye, Jiabo and Chen, Hehong and Xu, Guohai and Cao, Zheng and Zhang, Ji and Huang, Songfang and Huang, Fei and Zhou, Jingren and Luo, Si},
191
+ year={2022},
192
+ journal={arXiv}
193
+ }
194
+ ```
195
+
196
+
197
+
config.yaml ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ bert_config: 'config_bert.json'
2
+
3
+ image_res: 504
4
+ batch_size_train: 128
5
+ vision_width: 1024
6
+ distill: True
7
+ clip_name: "ViT-L-14"
8
+ batch_size_test: 64
9
+ k_test: 128
10
+
11
+ alpha: 0.4
12
+ warm_up: True
13
+
14
+ eos: '[SEP]'
15
+
16
+ optimizer: {opt: adamW, lr1: 3e-5, lr2: 5e-6, weight_decay: 0.02}
17
+ schedular: {sched: cosine, lr: 3e-5, epochs: 8, min_lr: 1e-6, decay_rate: 1, warmup_lr: 1e-5, warmup_epochs: 4, cooldown_epochs: 0}
18
+
19
+ # predictor
20
+ min_length: 1
21
+ max_length: 10
22
+ beam_size: 5
23
+ add_ocr: False
24
+ add_object: False
25
+ text_encoder: 'bert-base-uncased'
26
+ text_decoder: 'bert-base-uncased'
27
+
28
+ # clip
29
+ clip_embed_dim: 768
30
+ clip_image_resolution: 224
31
+ clip_vision_layers: 24
32
+ clip_vision_width: 1024
33
+ clip_vision_patch_size: 14
34
+ clip_context_length: 77
35
+ clip_vocab_size: 49408
36
+ clip_transformer_width: 768
37
+ clip_transformer_heads: 12
38
+ clip_transformer_layers: 12
39
+
config_bert.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "BertForMaskedLM"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "hidden_act": "gelu",
7
+ "hidden_dropout_prob": 0.1,
8
+ "hidden_size": 768,
9
+ "initializer_range": 0.02,
10
+ "intermediate_size": 3072,
11
+ "layer_norm_eps": 1e-12,
12
+ "max_position_embeddings": 512,
13
+ "model_type": "bert",
14
+ "num_attention_heads": 12,
15
+ "num_hidden_layers": 12,
16
+ "pad_token_id": 0,
17
+ "type_vocab_size": 2,
18
+ "vocab_size": 30522,
19
+ "encoder_width": 768,
20
+ "add_cross_attention": false,
21
+ "use_cache":false,
22
+ "gradient_checkpointing": false,
23
+ "text_encoder_layers": 6,
24
+ "fusion_layers": 6,
25
+ "text_decode_layers": 12,
26
+ "stride_layer": 6
27
+ }
configuration.json ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "framework": "pytorch",
3
+ "task": "visual-question-answering",
4
+ "preprocessor": {
5
+ "type": "mplug-tasks-preprocessor"
6
+ },
7
+ "model": {
8
+ "type": "mplug"
9
+ },
10
+ "pipeline": {
11
+ "type": "visual-question-answering"
12
+ },
13
+ "train": {
14
+ "work_dir": "/tmp",
15
+ "max_epochs": 3,
16
+ "dataloader": {
17
+ "batch_size_per_gpu": 2,
18
+ "workers_per_gpu": 1
19
+ },
20
+ "optimizer": {
21
+ "type": "SGD",
22
+ "lr": 0.01,
23
+ "options": {
24
+ "grad_clip": {
25
+ "max_norm": 2.0
26
+ }
27
+ }
28
+ },
29
+ "lr_scheduler": {
30
+ "type": "StepLR",
31
+ "step_size": 2,
32
+ "options": {
33
+ "warmup": {
34
+ "type": "LinearWarmup",
35
+ "warmup_iters": 2
36
+ }
37
+ }
38
+ },
39
+ "hooks": [{
40
+ "type": "CheckpointHook",
41
+ "interval": 1
42
+ }, {
43
+ "type": "TextLoggerHook",
44
+ "interval": 1
45
+ }, {
46
+ "type": "IterTimerHook"
47
+ }, {
48
+ "type": "EvaluationHook",
49
+ "interval": 1
50
+ }]
51
+ },
52
+ "evaluation": {
53
+ "dataloader": {
54
+ "batch_size_per_gpu": 2,
55
+ "workers_per_gpu": 1,
56
+ "shuffle": false
57
+ }
58
+ }
59
+ }
60
+
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:702b1fcb760b1a2bb4ab7f7b0d6f540aad438716f9fdb602f85e121d5a371332
3
+ size 2989186229
resources/case.png ADDED

Git LFS Details

  • SHA256: e806b101c00d1cc295cad4056d7ce63459768a8eb6a1c57fd9b8f424f3a2661d
  • Pointer size: 132 Bytes
  • Size of remote file: 2.37 MB
resources/model.png ADDED

Git LFS Details

  • SHA256: 9259f6e9f8a17dabebfe53ff853ff644361c09e8c1a5a9c49f05b38ed04afece
  • Pointer size: 131 Bytes
  • Size of remote file: 495 kB
resources/vqa.png ADDED

Git LFS Details

  • SHA256: 0b0c354d29462f66907f6091a223335cff28fe3ef2e617d99f7912b7907dcb71
  • Pointer size: 131 Bytes
  • Size of remote file: 674 kB
resources/vqa_exp.png ADDED

Git LFS Details

  • SHA256: 1e93643aed2ccdf40b08ec830c88306263b3eef5f98d0283f7c02d75e6ae1fe6
  • Pointer size: 131 Bytes
  • Size of remote file: 386 kB
vocab.txt ADDED
The diff for this file is too large to render. See raw diff