토니의 연습장
CLIPModel Source Code
[ Reference ]
test_clip.py
from PIL import Image
import torch
from transformers import CLIPModel, CLIPProcessor

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

txt = ['a piece of sushi', 'a dog', 'a robot']
img = Image.open('/home/joseph/study/multimodal/ai_editor/my_data/optimus.jpg')

inputs = processor(text=txt, images=img, return_tensors="pt", padding=True)
print(inputs.keys())  # input_ids, attention_mask, pixel_values

outputs = model(**inputs)
logits_per_image = outputs.logits_per_image  # image-text similarity scores, shape (1, 3)
probs = logits_per_image.softmax(dim=1)      # probabilities over the three texts
print(probs)

# Index of the text with the highest probability
max_prob_index = torch.argmax(probs).item()
# Print the most relevant text
print(f"The most relevant text is: {txt[max_prob_index]}")
CLIPModel
class CLIPModel(CLIPPreTrainedModel):

Transformer

    def __init__(self, config: CLIPConfig):
        ...
        # the two encoders: one for text, one for images
        self.text_model = CLIPTextTransformer(text_config)
        self.vision_model = CLIPVisionTransformer(vision_config)
        # linear projections into the shared embedding space (projection_dim)
        self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
        self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
        # learnable temperature for the contrastive logits (stored as a log value)
        self.logit_scale = nn.Parameter(torch.ones([]) * self.config.logit_scale_init_value)
    def forward(self, input_ids=None, pixel_values=None, attention_mask=None, position_ids=None,
                return_loss=None, output_attentions=None, output_hidden_states=None, return_dict=None):
        ...
        # encode the image(s)
        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # encode the text(s)
        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # take the pooled outputs and project them into the shared embedding space
        image_embeds = vision_outputs[1]
        image_embeds = self.visual_projection(image_embeds)

        text_embeds = text_outputs[1]
        text_embeds = self.text_projection(text_embeds)
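The excerpt stops at the projections, but the remainder of forward is what produces the logits_per_image read in test_clip.py: both embeddings are L2-normalized and their cosine similarities are scaled by exp(logit_scale). Below is a self-contained paraphrase of that step; the helper name clip_logits is mine, not part of the library:

import torch

def clip_logits(image_embeds: torch.Tensor, text_embeds: torch.Tensor, logit_scale: torch.Tensor):
    # normalize so the dot product becomes a cosine similarity
    image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
    text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
    # logit_scale is stored as a log value, so exponentiate before scaling
    logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale.exp()
    logits_per_image = logits_per_text.t()  # shape (num_images, num_texts), e.g. (1, 3) in the script above
    return logits_per_image, logits_per_text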
Linear Projection
For the example above (3 texts, 1 image, clip-vit-base-patch32), the pooled text output has shape (3, 512) and the pooled image output has shape (1, 768). text_projection and visual_projection convert these two different hidden sizes (512 and 768) into the same projection_dim, unifying both modalities in one shared embedding space so they can be compared directly; a quick shape check follows the excerpt below.
    # in __init__:
        self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
        self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)

    # in forward():
        image_embeds = vision_outputs[1]
        image_embeds = self.visual_projection(image_embeds)

        text_embeds = text_outputs[1]
        text_embeds = self.text_projection(text_embeds)
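To sanity-check the sizes mentioned above, the hidden sizes and projection_dim can be read straight from the checkpoint's config. The toy tensors below only illustrate the shape change and are not real pooled outputs; the printed values hold for the clip-vit-base-patch32 checkpoint:

import torch
import torch.nn as nn
from transformers import CLIPConfig

config = CLIPConfig.from_pretrained("openai/clip-vit-base-patch32")
print(config.text_config.hidden_size)    # 512
print(config.vision_config.hidden_size)  # 768
print(config.projection_dim)             # 512

text_projection = nn.Linear(config.text_config.hidden_size, config.projection_dim, bias=False)
visual_projection = nn.Linear(config.vision_config.hidden_size, config.projection_dim, bias=False)

text_pooled = torch.randn(3, config.text_config.hidden_size)     # stand-in for 3 pooled text outputs
image_pooled = torch.randn(1, config.vision_config.hidden_size)  # stand-in for 1 pooled image output
print(text_projection(text_pooled).shape)     # torch.Size([3, 512])
print(visual_projection(image_pooled).shape)  # torch.Size([1, 512])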