shunk031 committed on
Commit
271108b
·
verified ·
1 Parent(s): 1a8740b

Upload AestheticsPredictorV1

Browse files
Files changed (3) hide show
  1. config.json +3 -0
  2. configuration_predictor.py +39 -0
  3. modeling_v1.py +63 -0
config.json CHANGED
@@ -4,6 +4,9 @@
4
  "AestheticsPredictorV1"
5
  ],
6
  "attention_dropout": 0.0,
 
 
 
7
  "dropout": 0.0,
8
  "hidden_act": "quick_gelu",
9
  "hidden_size": 768,
 
4
  "AestheticsPredictorV1"
5
  ],
6
  "attention_dropout": 0.0,
7
+ "auto_map": {
8
+ "AutoModel": "modeling_v1.AestheticsPredictorV1"
9
+ },
10
  "dropout": 0.0,
11
  "hidden_act": "quick_gelu",
12
  "hidden_size": 768,
configuration_predictor.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers.models.clip.configuration_clip import CLIPVisionConfig
2
+
3
+
4
class AestheticsPredictorConfig(CLIPVisionConfig):
    """Configuration class for the aesthetics predictor models.

    Subclasses ``CLIPVisionConfig`` and overrides ``model_type``. Every
    constructor argument is forwarded unchanged to the parent class, so the
    parameters carry whatever meaning ``CLIPVisionConfig`` gives them.
    """

    model_type = "aesthetics_predictor"

    def __init__(
        self,
        hidden_size: int = 768,
        intermediate_size: int = 3072,
        projection_dim: int = 512,
        num_hidden_layers: int = 12,
        num_attention_heads: int = 12,
        num_channels: int = 3,
        image_size: int = 224,
        patch_size: int = 32,
        hidden_act: str = "quick_gelu",
        layer_norm_eps: float = 0.00001,
        attention_dropout: float = 0,
        initializer_range: float = 0.02,
        initializer_factor: float = 1,
        **kwargs,
    ):
        """Build the config by delegating to ``CLIPVisionConfig.__init__``.

        Any extra keyword arguments are passed through to the parent.
        """
        # Forward by keyword, not by position: a positional call silently
        # assigns values to the wrong fields if the parent's parameter order
        # differs in the installed transformers version.
        super().__init__(
            hidden_size=hidden_size,
            intermediate_size=intermediate_size,
            projection_dim=projection_dim,
            num_hidden_layers=num_hidden_layers,
            num_attention_heads=num_attention_heads,
            num_channels=num_channels,
            image_size=image_size,
            patch_size=patch_size,
            hidden_act=hidden_act,
            layer_norm_eps=layer_norm_eps,
            attention_dropout=attention_dropout,
            initializer_range=initializer_range,
            initializer_factor=initializer_factor,
            **kwargs,
        )
modeling_v1.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, Final, Optional, Tuple, Union
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ from transformers import CLIPVisionModelWithProjection, logging
6
+ from transformers.modeling_outputs import ImageClassifierOutputWithNoAttention
7
+
8
+ from .configuration_predictor import AestheticsPredictorConfig
9
+
10
+ logging.set_verbosity_error()
11
+
12
# Download locations of the linear-head checkpoints, keyed by the name of the
# OpenAI CLIP checkpoint each head is paired with.
URLS: Final[Dict[str, str]] = dict(
    [
        (
            "openai/clip-vit-base-patch16",
            "https://github.com/LAION-AI/aesthetic-predictor/raw/main/sa_0_4_vit_b_16_linear.pth",
        ),
        (
            "openai/clip-vit-base-patch32",
            "https://github.com/LAION-AI/aesthetic-predictor/raw/main/sa_0_4_vit_b_32_linear.pth",
        ),
        (
            "openai/clip-vit-large-patch14",
            "https://github.com/LAION-AI/aesthetic-predictor/raw/main/sa_0_4_vit_l_14_linear.pth",
        ),
    ]
)
17
+
18
+
19
class AestheticsPredictorV1(CLIPVisionModelWithProjection):
    """CLIP vision model with projection, plus a linear head giving one score.

    The head maps the L2-normalised image embedding (``projection_dim``
    features) to a single value per image.
    """

    def __init__(self, config: AestheticsPredictorConfig) -> None:
        """Create the CLIP backbone and the single-output linear predictor."""
        super().__init__(config)
        self.predictor = nn.Linear(config.projection_dim, 1)
        self.post_init()

    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, ImageClassifierOutputWithNoAttention]:
        """Score images from their normalised CLIP embeddings.

        Args:
            pixel_values: Image batch, passed straight to the parent forward.
            output_attentions: Passed straight to the parent forward.
            output_hidden_states: Passed straight to the parent forward.
            return_dict: Whether to return an output object instead of a
                tuple; falls back to ``config.use_return_dict`` when ``None``.

        Returns:
            ``(None, prediction, image_embeds)`` when ``return_dict`` is
            false; otherwise an ``ImageClassifierOutputWithNoAttention`` with
            ``loss=None``, ``logits`` set to the prediction and
            ``hidden_states`` set to the normalised image embeddings.
        """
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        outputs = super().forward(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        image_embeds = outputs[0]  # image_embeds

        # Normalise out of place. `norm` keeps its input for the backward
        # pass, so an in-place `/=` on that same tensor makes autograd raise
        # when gradients are enabled; it would also mutate the tensor held
        # inside the parent's output object.
        image_embeds = image_embeds / image_embeds.norm(dim=-1, keepdim=True)

        prediction = self.predictor(image_embeds)

        if not return_dict:
            return (None, prediction, image_embeds)

        return ImageClassifierOutputWithNoAttention(
            loss=None,
            logits=prediction,
            hidden_states=image_embeds,
        )
55
+
56
+
57
def convert_from_openai_clip(openai_model_name: str) -> AestheticsPredictorV1:
    """Build an ``AestheticsPredictorV1`` from an OpenAI CLIP checkpoint.

    Loads the CLIP weights named by ``openai_model_name``, downloads the
    matching linear-head checkpoint listed in ``URLS`` and loads it into
    ``model.predictor``.

    Args:
        openai_model_name: A key of ``URLS``.

    Returns:
        The assembled model, switched to evaluation mode.

    Raises:
        KeyError: If ``openai_model_name`` has no entry in ``URLS``.
    """
    # Validate the name before loading the backbone so an unsupported name
    # fails immediately instead of after the CLIP weights have been fetched.
    if openai_model_name not in URLS:
        raise KeyError(
            f"No aesthetics predictor weights for {openai_model_name!r}; "
            f"supported models: {sorted(URLS)}"
        )

    model = AestheticsPredictorV1.from_pretrained(openai_model_name)

    # map_location="cpu" keeps the download loadable on hosts without CUDA
    # even if the checkpoint tensors were saved from a GPU.
    state_dict = torch.hub.load_state_dict_from_url(
        URLS[openai_model_name], map_location="cpu"
    )
    model.predictor.load_state_dict(state_dict)
    model.eval()

    return model