
CLIPModel

bellmake 2024. 8. 23. 15:23


[ Reference ]

test_clip.py

import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

txt = ['a piece of sushi', 'a dog', 'a robot']
img = Image.open('/home/joseph/study/multimodal/ai_editor/my_data/optimus.jpg')

inputs = processor(text=txt, images=img, return_tensors="pt", padding=True)
print(inputs.keys())

outputs = model(**inputs)
logits_per_image = outputs.logits_per_image
probs = logits_per_image.softmax(dim=1)
print(probs)

# Extract the index of the text with the highest probability
max_prob_index = torch.argmax(probs).item()

# Print the text with the highest probability
print(f"The most relevant text is: {txt[max_prob_index]}")
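For reference, the processor call above returns a dict with input_ids, attention_mask, and pixel_values. A rough shape check (shapes assume the 3 prompts and 1 image of this example and the 224x224 input resolution of clip-vit-base-patch32; the exact sequence length depends on the tokenizer):

print(inputs["input_ids"].shape)       # (3, seq_len) - tokenized, padded prompts
print(inputs["attention_mask"].shape)  # (3, seq_len)
print(inputs["pixel_values"].shape)    # (1, 3, 224, 224) - preprocessed image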


CLIPModel


class CLIPModel(CLIPPreTrainedModel):

Transformer

def __init__(self, config: CLIPConfig):
    ...
    # two separate encoders, one per modality
    self.text_model = CLIPTextTransformer(text_config)
    self.vision_model = CLIPVisionTransformer(vision_config)

    # projection heads that map both encoders into a shared embedding space of size projection_dim
    self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
    self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
    # learnable temperature applied to the contrastive logits
    self.logit_scale = nn.Parameter(torch.ones([]) * self.config.logit_scale_init_value)
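To make these submodules concrete, here is a quick inspection sketch; the printed values assume the openai/clip-vit-base-patch32 checkpoint loaded in the example at the top:

print(type(model.text_model).__name__)    # CLIPTextTransformer
print(type(model.vision_model).__name__)  # CLIPVisionTransformer
print(model.text_projection)              # Linear(in_features=512, out_features=512, bias=False)
print(model.visual_projection)            # Linear(in_features=768, out_features=512, bias=False)
print(model.logit_scale)                  # scalar Parameter, initialized to log(1/0.07) ≈ 2.6592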

 

def forward(
    ...
):
    ...
    # run the image(s) through the vision encoder
    vision_outputs = self.vision_model(
        pixel_values=pixel_values,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )

    # run the text(s) through the text encoder
    text_outputs = self.text_model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        position_ids=position_ids,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )

    # take each encoder's pooled output and project it into the shared embedding space
    image_embeds = vision_outputs[1]
    image_embeds = self.visual_projection(image_embeds)

    text_embeds = text_outputs[1]
    text_embeds = self.text_projection(text_embeds)
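After the projections, the forward pass L2-normalizes both embeddings and scales their cosine similarity by logit_scale.exp() to produce logits_per_text / logits_per_image. A minimal sketch of the same similarity computation done outside the model, using the public get_text_features / get_image_features helpers and reusing model and inputs from the example above:

with torch.no_grad():
    text_embeds = model.get_text_features(input_ids=inputs["input_ids"],
                                          attention_mask=inputs["attention_mask"])  # (3, 512)
    image_embeds = model.get_image_features(pixel_values=inputs["pixel_values"])    # (1, 512)

# normalize, then scale the cosine similarity by the learned temperature
text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
logits_per_image = model.logit_scale.exp() * image_embeds @ text_embeds.t()         # (1, 3)
print(logits_per_image.softmax(dim=1))  # should match the probs printed earlier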

 

Linear Projection

 

For the example above, the vision encoder's pooled output has size (1, 768) for the single image, while the text encoder's pooled output has size (3, 512) for the three prompts. The visual_projection and text_projection layers convert these differently sized 768- and 512-dimensional outputs into one common projection_dim-sized embedding space, so image and text vectors can be compared directly (see the shape check after the code below).

def __init__(self, config: CLIPConfig):
    ...
    self.text_model = CLIPTextTransformer(text_config)
    self.vision_model = CLIPVisionTransformer(vision_config)

    self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
    self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
    self.logit_scale = nn.Parameter(torch.ones([]) * self.config.logit_scale_init_value)

def forward(
    ...
):
    ...
    image_embeds = vision_outputs[1]
    image_embeds = self.visual_projection(image_embeds)

    text_embeds = text_outputs[1]
    text_embeds = self.text_projection(text_embeds)
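A short shape check of the projection dimensions (assumes the openai/clip-vit-base-patch32 checkpoint and the 3-text / 1-image inputs from the example at the top):

print(model.config.text_config.hidden_size)    # 512 - width of the text pooled output
print(model.config.vision_config.hidden_size)  # 768 - width of the vision pooled output
print(model.config.projection_dim)             # 512 - shared embedding size after projection

outputs = model(**inputs)
print(outputs.text_embeds.shape)   # torch.Size([3, 512]) - 3 prompts, projected
print(outputs.image_embeds.shape)  # torch.Size([1, 512]) - 1 image, projected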
