> Google colab: https://colab.research.google.com/drive/17FUjpCSzJ73VIgd6R-nB2EFvlDqYhEZm?usp=sharing
1. Setting up - installation
```python=
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip3 install ftfy regex tqdm
!pip3 install git+https://github.com/openai/CLIP.git
!pip3 install Cython
!pip3 install scikit-image
```
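* To sanity-check the installation, you can list the pretrained CLIP variants the package ships (a quick check, not part of the pipeline); `"ViT-L/14"`, used below, should appear in the output.
```python=
import clip
# list the pretrained CLIP model names available in this install
print(clip.available_models())
```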
2. Setting up - import
```python=
import torch
import clip
from PIL import Image
# For COCO
from pycocotools.coco import COCO
import numpy as np
import skimage.io as io
import matplotlib.pyplot as plt
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-L/14", device=device)
dataDir="/content/drive/MyDrive/Colab Notebooks/ML_COCO/COCO"
dataType='val2017'
annFile='{}/annotations/instances_{}.json'.format(dataDir,dataType)
```
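* The `dataDir` above assumes the COCO annotation file already lives on Google Drive. A minimal sketch for making that path visible in Colab (adjust the path to wherever you keep `instances_val2017.json`):
```python=
from google.colab import drive
# makes /content/drive/MyDrive/... available inside the Colab runtime
drive.mount('/content/drive')
```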
3. Setting up - COCO dataset
* COCO is used here only to test the model; when applying the model to your own predictions, you can skip this code.
```python=
# initialize COCO api for instance annotations
coco=COCO(annFile)
```
```python=
# display COCO categories and supercategories
cats = coco.loadCats(coco.getCatIds())
nms=[cat['name'] for cat in cats]
print('COCO categories: \n{}\n'.format(' '.join(nms)))
nms = set([cat['supercategory'] for cat in cats])
print('COCO supercategories: \n{}'.format(' '.join(nms)))
```
```python=
def get_COCO_images(category, supercategory):
    # get the IDs of all images containing the given category
    # (filter by category name and its supercategory)
    catIds = coco.getCatIds(catNms=[category], supNms=[supercategory])
    imgIds = coco.getImgIds(catIds=catIds)
    # load up to 40 images (one per simulated model)
    loaded_image_list = []
    for i in range(min(40, len(imgIds))):
        img_info = coco.loadImgs(imgIds[i])[0]
        loaded_img = io.imread(img_info['coco_url'])
        loaded_image_list.append(loaded_img)
        # plt.axis('off')
        # plt.imshow(loaded_img)
        # plt.show()
    return loaded_image_list
```
* Simulate 40 models and a testing dataset with 5 samples:
```python=
sample_predictions = [] # element i holds every model's prediction for sample i
sample_predictions.append(get_COCO_images('cat', 'animal'))
sample_predictions.append(get_COCO_images('dog', 'animal'))
sample_predictions.append(get_COCO_images('bus', 'vehicle'))
sample_predictions.append(get_COCO_images('motorcycle', 'vehicle'))
sample_predictions.append(get_COCO_images('cake', 'food'))
```
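* A quick sanity check on the simulated data (5 testing samples, 40 model predictions each):
```python=
print(len(sample_predictions))     # 5 testing samples
print(len(sample_predictions[0]))  # 40 predictions (one per simulated model)
```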
4. CLIP model
* function to project images into CLIP space
```python=
# return a list containing a feature vector for each image in the input list
def project_to_CLIP(sample_preds):
    with torch.no_grad():
        # project every model's predicted image for this sample into CLIP space;
        # each image yields one feature vector
        all_image_features = []
        for img in sample_preds:
            image = preprocess(Image.fromarray(img)).unsqueeze(0).to(device)
            features = model.encode_image(image)
            all_image_features.append(features)
        return all_image_features
```
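* Encoding images one at a time is simple but slow. If GPU memory allows, a batched variant (a sketch, not part of the original pipeline; it uses the same `preprocess` and `model` as above) encodes all of a sample's images in one forward pass:
```python=
def project_to_CLIP_batched(sample_preds):
    # stack the preprocessed images into one (m, 3, H, W) batch
    batch = torch.stack(
        [preprocess(Image.fromarray(img)) for img in sample_preds]
    ).to(device)
    with torch.no_grad():
        features = model.encode_image(batch)  # (m, feature_dim)
    # split back into per-image (1, feature_dim) tensors,
    # matching project_to_CLIP's output format
    return [f.unsqueeze(0) for f in features]
```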
* functions to compute the aggregate representation and find the best representative image
* Calculate the aggregate representation by combining the features of all the images. One simple approach is to take the mean of all the feature vectors.
```python=
def aggregate_calculation(sample_features):
    aggregate_features = torch.stack(sample_features).mean(dim=0)
    # print(f'mean features: {aggregate_features}')
    return aggregate_features
```
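* Since the comparison below uses cosine similarity, a common variant (an alternative sketch, not the approach used above) L2-normalizes each feature vector before averaging, so images with larger feature norms don't dominate the mean:
```python=
def aggregate_calculation_normalized(sample_features):
    stacked = torch.stack(sample_features)                 # (m, 1, feature_dim)
    normalized = stacked / stacked.norm(dim=-1, keepdim=True)
    return normalized.mean(dim=0)                          # (1, feature_dim)
```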
```python=
def similarity(image1_features, image2_features):
    # both inputs have shape (1, feature_dim); compare the underlying vectors
    cos = torch.nn.CosineSimilarity(dim=0)
    return cos(image1_features[0], image2_features[0]).item()
```
* CLIP MODEL
```python=
# input sample_predictions should be n x m: element i holds every model's
# prediction for testing sample i
# m = number of models
# n = number of testing samples
def CLIP_model(sample_predictions):
    # project images onto CLIP space (images -> feature vectors)
    sample_feature_vectors = []
    for sample_pred in sample_predictions:
        sample_feature_vectors.append(project_to_CLIP(sample_pred))
    # find the mean feature vector of each sample
    sample_aggregate_features = []
    for sample_features in sample_feature_vectors:
        aggregate_f = aggregate_calculation(sample_features)
        sample_aggregate_features.append(aggregate_f)
    # compare each image's feature vector with the mean feature vector and
    # select the one closest to it; the best similarity/index must be
    # reset for every sample
    final_predictions = []
    for sample_indx in range(len(sample_aggregate_features)):
        best_similarity = float('-inf')
        best_image_index = -1
        agf = sample_aggregate_features[sample_indx]
        all_image_features = sample_feature_vectors[sample_indx]
        for i, features in enumerate(all_image_features):
            sim = similarity(agf, features)
            if sim > best_similarity:
                best_similarity = sim
                best_image_index = i
        final_predictions.append(best_image_index)
    # visualize the best image for each testing sample (can be commented out)
    for sample_indx in range(len(sample_predictions)):
        f = final_predictions[sample_indx]
        best_image = sample_predictions[sample_indx][f]
        plt.axis('off')
        plt.imshow(best_image)
        plt.show()
    return final_predictions
```
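* Putting it all together on the simulated predictions; `final_predictions` holds, for each testing sample, the index of the model whose prediction sits closest to the CLIP-space mean:
```python=
final_predictions = CLIP_model(sample_predictions)
print(final_predictions)  # one index in [0, 40) per testing sample
```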