In [None]:
import torch
from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor, AutoModel
from qwen_vl_utils import process_vision_info
from PIL import Image
import os
import numpy as np
from tqdm import tqdm

# --- Configuration ---
MODEL_NAME = "microsoft/layoutlmv3-base"  # You can choose other model sizes
IMAGE_DIR = "/home/nguyendc/phat-dev/clustering/extracted_images"
BATCH_SIZE = 32
# GPU to use when the machine has more than one; otherwise GPU 0 (or CPU) is used.
PREFERRED_GPU_INDEX = 1
# --- End Configuration ---

# Pick a device that actually exists: asking for "cuda:1" on a single-GPU
# host fails as soon as a tensor is moved there.
if torch.cuda.is_available():
    gpu_index = PREFERRED_GPU_INDEX if torch.cuda.device_count() > PREFERRED_GPU_INDEX else 0
    device = f"cuda:{gpu_index}"
else:
    device = "cpu"
print(f"Using device: {device}")

# Load the model and processor.
# The model is placed on `device` so that it lives on the same device the
# input tensors are sent to.
model = AutoModel.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.bfloat16,
    device_map=device,  # , attn_implementation="flash_attention_2",
)
processor = AutoProcessor.from_pretrained(MODEL_NAME, trust_remote_code=True)

In [None]:
def get_image_embeddings(image_paths, return_paths=False):
    """
    Load a batch of images and return one embedding per successfully loaded image.

    Paths whose extension is not .png/.jpg/.jpeg are ignored, and images that
    cannot be opened are skipped with a printed warning. The result can
    therefore contain fewer rows than ``image_paths`` has entries.

    Args:
        image_paths: Iterable of image file paths (strings).
        return_paths: When True, also return the paths that were actually
            embedded, in the same order as the embedding rows. Use this to
            keep paths and embeddings aligned when some images are skipped.

    Returns:
        A float16 NumPy array with one row per embedded image; each row is
        the first-token vector of the model's first output. When no image
        could be loaded, an empty array of shape (0, 0) is returned.
        With ``return_paths=True`` the return value is the tuple
        ``(embeddings, valid_paths)``.
    """
    images_pil = []
    valid_paths = []
    for path in image_paths:
        if not path.lower().endswith(('.png', '.jpg', '.jpeg')):
            continue
        try:
            # The processor expects PIL images in RGB format. convert() returns
            # a fully loaded copy, so the file handle can be closed right away.
            with Image.open(path) as img:
                images_pil.append(img.convert("RGB"))
            valid_paths.append(path)
        except Exception as e:
            print(f"Warning: Could not load image {path}. Skipping. Error: {e}")

    if not images_pil:
        # Always hand back an array (never a tuple) unless paths were requested,
        # so callers can iterate over the result unconditionally.
        embeddings = np.empty((0, 0), dtype=np.float16)
        return (embeddings, valid_paths) if return_paths else embeddings

    # Only the image tensor is fed to the model; no text prompt is supplied.
    inputs = processor(
        images=images_pil,
        padding=True,
        return_tensors="pt"
    )
    # Follow the model's own device/dtype rather than a global setting so the
    # inputs can never end up on a different device than the weights.
    pixel_values = inputs['pixel_values'].to(device=model.device, dtype=model.dtype)

    with torch.no_grad():
        outputs = model(pixel_values=pixel_values)
        # First token of the first output tensor is used as the image embedding.
        embeddings = outputs[0][:, 0, :]

    embeddings = embeddings.to(torch.float16).cpu().numpy()
    return (embeddings, valid_paths) if return_paths else embeddings

In [None]:
import json

# --- Process all images in the directory ---
# File the embeddings are streamed to. The clustering cell reads this exact
# name back, so the spelling must match ("osteopathie").
OUTPUT_JSON = "embeddings_factures_osteopathie_1k.json"

# Sorted so that the processing order does not depend on the arbitrary order
# returned by os.listdir().
image_files = sorted(
    os.path.join(IMAGE_DIR, f)
    for f in os.listdir(IMAGE_DIR)
    if f.lower().endswith(('.png', '.jpg', '.jpeg'))
)
all_embeddings = []
filepaths = []


def _embed_batch(paths):
    """Call get_image_embeddings and always return just the embedding rows."""
    result = get_image_embeddings(paths)
    # An (embeddings, paths) tuple may come back for an empty batch.
    if isinstance(result, tuple):
        result = result[0]
    return result


def _paired_embeddings(batch_paths):
    """Return (path, embedding) pairs for a batch, guaranteed to be aligned."""
    batch_embeddings = _embed_batch(batch_paths)
    if len(batch_embeddings) == len(batch_paths):
        return list(zip(batch_paths, batch_embeddings))
    # At least one image was skipped, so positional pairing would attach
    # embeddings to the wrong files. Re-embed one image at a time instead.
    pairs = []
    for path in batch_paths:
        single = _embed_batch([path])
        if len(single) == 1:
            pairs.append((path, single[0]))
    return pairs


with open(OUTPUT_JSON, "w") as f:
    # The JSON array is written incrementally so that all embeddings never
    # have to be held in memory at once.
    f.write("[\n")
    first = True
    for i in tqdm(range(0, len(image_files), BATCH_SIZE)):
        batch_paths = image_files[i:i + BATCH_SIZE]
        for path, emb in _paired_embeddings(batch_paths):
            if not first:
                f.write(",\n")
            json.dump({"filepath": path, "embedding": emb.tolist()}, f)
            first = False
    f.write("\n]\n")

print("Embeddings extracted and saved.")


In [1]:
import fiftyone as fo
import fiftyone.brain as fob
import numpy as np
from sklearn.mixture import GaussianMixture
import json

DATASET_NAME = "mock"

json_path = "./embeddings_factures_osteopathie_1k.json"

# Each record is expected to carry a "filepath" and an "embedding" entry.
with open(json_path, "r") as file:
    embedding_data = json.load(file)

file_paths = [record.get("filepath") for record in embedding_data]
embeddings = [record.get("embedding") for record in embedding_data]

# Start from a clean dataset on every run so the cell can be re-executed.
if DATASET_NAME in fo.list_datasets():
    dataset = fo.load_dataset(DATASET_NAME)
    dataset.delete()
dataset = fo.Dataset(DATASET_NAME)

# Add samples to the dataset
samples = [fo.Sample(filepath=p) for p in file_paths]
dataset.add_samples(samples)

# Building Gaussian mixture model (GMM)

n_gaussians = 50
gmm = GaussianMixture(n_components=n_gaussians, random_state=42)
gmm.fit(embeddings)
cluster_labels = gmm.predict(embeddings)

# Adding cluster labels to the dataset for visualization.
# The declared field and the field written on each sample must be the same
# name; deriving it from n_gaussians keeps the two in sync.
cluster_field = f"gmm_cluster_{n_gaussians}_gaussians"
dataset.add_sample_field(cluster_field, fo.IntField)
# Relies on the dataset iterating in the order the samples were added, which
# is the order of `embeddings` and therefore of `cluster_labels`.
for sample, label in zip(dataset, cluster_labels):
    sample[cluster_field] = int(label)
    sample.save()

# n_gaussians = 100
# gmm = GaussianMixture(n_components=n_gaussians, random_state=42)
# gmm.fit(embeddings)
# cluster_labels = gmm.predict(embeddings)

# # Adding labeled embeddings to visualization
# dataset.add_sample_field("gmm_cluster", fo.IntField)
# for sample, label in zip(dataset, cluster_labels):
# sample["gmm_cluster_200_gaussians"] = int(label)
# sample.save()

# # --- Visualize the Embeddings with UMAP ---
# # This will compute a 2D representation of your embeddings
# # for visualization.
# res = fob.compute_visualization(
# dataset,
# embeddings=embeddings,
# brain_key="qwen_vision_viz",
# method="tsne",
# verbose=True
# )
# dataset.set_values("qwen_umap", res.current_points)

# print("UMAP visualization computed. Launch the app to see the plot.")
# session = fo.launch_app(dataset)
# session.wait()

 from .autonotebook import tqdm as notebook_tqdm


 100% |███████████████| 1090/1090 [182.4ms elapsed, 0s remaining, 6.0K samples/s] 


In [None]:
from sklearn.metrics import silhouette_samples

# Silhouette coefficient of every sample under the current GMM cluster labels.
embedding_matrix = np.array(embeddings)
per_sample_silhouette = silhouette_samples(embedding_matrix, labels=cluster_labels)
print(per_sample_silhouette)

[[[ 2.22716806e+06 -1.55841522e+05 1.20176465e+04 ... 7.78593400e+03
 1.90060269e+03 -1.81817591e+02]
 [-1.55841522e+05 2.45425677e+06 9.82699914e+04 ... -5.37744672e+04
 -1.14976214e+05 1.95498686e+05]
 [ 1.20176465e+04 9.82699914e+04 2.05845769e+06 ... -6.42761657e+04
 -1.57428613e+03 1.90527654e+05]
 ...
 [ 7.78593400e+03 -5.37744672e+04 -6.42761657e+04 ... 2.19897397e+06
 9.79439501e+04 2.63393791e+05]
 [ 1.90060269e+03 -1.14976214e+05 -1.57428613e+03 ... 9.79439501e+04
 2.09478614e+06 1.35758244e+05]
 [-1.81817591e+02 1.95498686e+05 1.90527654e+05 ... 2.63393791e+05
 1.35758244e+05 2.12691079e+06]]

 [[ 2.83658946e+06 -1.34780282e+05 6.49580505e+04 ... 8.54367930e+04
 2.68631019e+04 -2.00238083e+03]
 [-1.34780282e+05 3.16043262e+06 1.74730403e+05 ... -1.52450289e+04
 -1.33867983e+05 2.33287605e+05]
 [ 6.49580505e+04 1.74730403e+05 2.62079599e+06 ... -1.34705133e+05
 -3.42729631e+03 2.52694121e+05]
 ...
 [ 8.54367930e+04 -1.52450289e+04 -1.34705133e+05 ... 2.83179172e+06
 1.2070406