{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "59f8a415",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2025-09-02 15:00:12.976185: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
"WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n",
"E0000 00:00:1756825212.987686 3903757 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
"E0000 00:00:1756825212.991038 3903757 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
"W0000 00:00:1756825213.000855 3903757 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
"W0000 00:00:1756825213.000880 3903757 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
"W0000 00:00:1756825213.000882 3903757 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
"W0000 00:00:1756825213.000884 3903757 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
"2025-09-02 15:00:13.005218: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
"To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2025-09-02 15:00:17,970] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/bin/ld: cannot find -laio: No such file or directory\n",
"collect2: error: ld returned 1 exit status\n",
"/usr/bin/ld: cannot find -laio: No such file or directory\n",
"collect2: error: ld returned 1 exit status\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Using device: cuda\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00, 1.09it/s]\n",
"Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.\n"
]
}
],
"source": [
"import torch\n",
"from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor\n",
"# from qwen_vl_utils import process_vision_info\n",
"from PIL import Image\n",
"import os\n",
"import numpy as np\n",
"from tqdm import tqdm\n",
"\n",
"# --- Configuration ---\n",
"MODEL_NAME = \"Qwen/Qwen2.5-VL-3B-Instruct\" # You can choose other model sizes\n",
"\n",
"IMAGE_DIR = \"/home/nguyendc/model-factory/Finetuning-Automation/etc/data/media/docai_mgp_facture_v2_0/\"\n",
"BATCH_SIZE = 4\n",
"# --- End Configuration ---\n",
"\n",
"# Check for GPU availability\n",
"device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
"print(f\"Using device: {device}\")\n",
"\n",
"# Load the model and processor\n",
"model = Qwen2_5_VLForConditionalGeneration.from_pretrained(\n",
" MODEL_NAME, torch_dtype=\"bfloat16\", device_map=\"cuda\", attn_implementation=\"flash_attention_2\",\n",
")\n",
"processor = AutoProcessor.from_pretrained(MODEL_NAME, trust_remote_code=True)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "13479e1a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Qwen2_5_VLProcessor:\n",
"- image_processor: Qwen2VLImageProcessor {\n",
" \"do_convert_rgb\": true,\n",
" \"do_normalize\": true,\n",
" \"do_rescale\": true,\n",
" \"do_resize\": true,\n",
" \"image_mean\": [\n",
" 0.48145466,\n",
" 0.4578275,\n",
" 0.40821073\n",
" ],\n",
" \"image_processor_type\": \"Qwen2VLImageProcessor\",\n",
" \"image_std\": [\n",
" 0.26862954,\n",
" 0.26130258,\n",
" 0.27577711\n",
" ],\n",
" \"max_pixels\": 12845056,\n",
" \"merge_size\": 2,\n",
" \"min_pixels\": 3136,\n",
" \"patch_size\": 14,\n",
" \"processor_class\": \"Qwen2_5_VLProcessor\",\n",
" \"resample\": 3,\n",
" \"rescale_factor\": 0.00392156862745098,\n",
" \"size\": {\n",
" \"longest_edge\": 12845056,\n",
" \"shortest_edge\": 3136\n",
" },\n",
" \"temporal_patch_size\": 2\n",
"}\n",
"\n",
"- tokenizer: Qwen2TokenizerFast(name_or_path='Qwen/Qwen2.5-VL-3B-Instruct', vocab_size=151643, model_max_length=131072, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={\n",
"\t151643: AddedToken(\"<|endoftext|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t151644: AddedToken(\"<|im_start|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t151645: AddedToken(\"<|im_end|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t151646: AddedToken(\"<|object_ref_start|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t151647: AddedToken(\"<|object_ref_end|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t151648: AddedToken(\"<|box_start|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t151649: AddedToken(\"<|box_end|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t151650: AddedToken(\"<|quad_start|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t151651: AddedToken(\"<|quad_end|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t151652: AddedToken(\"<|vision_start|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t151653: AddedToken(\"<|vision_end|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t151654: AddedToken(\"<|vision_pad|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t151655: AddedToken(\"<|image_pad|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t151656: AddedToken(\"<|video_pad|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t151657: AddedToken(\"<tool_call>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n",
"\t151658: AddedToken(\"</tool_call>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n",
"\t151659: AddedToken(\"<|fim_prefix|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n",
"\t151660: AddedToken(\"<|fim_middle|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n",
"\t151661: AddedToken(\"<|fim_suffix|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n",
"\t151662: AddedToken(\"<|fim_pad|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n",
"\t151663: AddedToken(\"<|repo_name|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n",
"\t151664: AddedToken(\"<|file_sep|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n",
"}\n",
")\n",
"\n",
"{\n",
" \"processor_class\": \"Qwen2_5_VLProcessor\"\n",
"}"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"processor"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cdfdab0e",
"metadata": {},
"outputs": [],
"source": [
"def get_image_embeddings(image_paths):\n",
" \"\"\"\n",
" Processes a batch of images and extracts their embeddings.\n",
" \"\"\"\n",
" images_pil = []\n",
" valid_paths = []\n",
" for path in image_paths:\n",
" if path.lower().endswith(('.png', '.jpg', '.jpeg')):\n",
" try:\n",
" # The processor expects PIL images in RGB format\n",
" images_pil.append(Image.open(path).convert(\"RGB\"))\n",
" valid_paths.append(path)\n",
" except Exception as e:\n",
" print(f\"Warning: Could not load image {path}. Skipping. Error: {e}\")\n",
"\n",
" if not images_pil:\n",
" return np.array([]), []\n",
"\n",
" # For pure vision feature extraction, we can provide an empty text prompt.\n",
" # The processor handles tokenizing text and preparing images.\n",
" inputs = processor(\n",
" text=[\"\"] * len(images_pil),\n",
" images=images_pil,\n",
" padding=True,\n",
" return_tensors=\"pt\"\n",
" ).to(device)\n",
"\n",
" with torch.no_grad():\n",
" # Get the vision embeddings from the model's vision tower\n",
" vision_outputs = model.visual(inputs['pixel_values'].to(dtype=model.dtype), grid_thw=inputs['image_grid_thw'])\n",
" # We'll use the pooled output as the embedding\n",
" embeddings = vision_outputs\n",
"\n",
" return embeddings.to(torch.float16).cpu().numpy()"
]
},
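{
"cell_type": "code",
"execution_count": null,
"id": "b3f1d2a9",
"metadata": {},
"outputs": [],
"source": [
"# Quick sanity-check sketch: call get_image_embeddings on a couple of files from IMAGE_DIR\n",
"# and confirm we get one fixed-size vector per image.\n",
"# Assumes IMAGE_DIR exists and contains at least one .png/.jpg/.jpeg file.\n",
"sample_files = [os.path.join(IMAGE_DIR, f) for f in os.listdir(IMAGE_DIR)\n",
"                if f.lower().endswith(('.png', '.jpg', '.jpeg'))][:2]\n",
"if sample_files:\n",
"    test_embeddings = get_image_embeddings(sample_files)\n",
"    print(f\"{len(sample_files)} images -> embeddings of shape {test_embeddings.shape}\")\n",
"else:\n",
"    print(\"No images found in IMAGE_DIR; skipping the check.\")"
]
},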
{
"cell_type": "code",
"execution_count": null,
"id": "cdaebb7b",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 700/700 [22:12<00:00, 1.90s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Embeddings extracted and saved.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"import json\n",
"\n",
"# --- Process all images in the directory ---\n",
"image_files = [os.path.join(IMAGE_DIR, f) for f in os.listdir(IMAGE_DIR) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]\n",
"all_embeddings = []\n",
"filepaths = []\n",
"\n",
"with open(\"embeddings_factures_osteopathie_1k_qwen.json\", \"w\") as f:\n",
"\n",
" f.write(\"[\\n\")\n",
" first = True\n",
" for i in tqdm(range(0, len(image_files), BATCH_SIZE)):\n",
" batch_paths = image_files[i:i+BATCH_SIZE]\n",
" batch_embeddings = get_image_embeddings(batch_paths)\n",
" embeddings_list = [emb.tolist() for emb in batch_embeddings]\n",
" for path, emb in zip(batch_paths, embeddings_list):\n",
" if not first:\n",
" f.write(\",\\n\")\n",
" json.dump({\"filepath\": path, \"embedding\": emb}, f)\n",
" first = False\n",
" f.write(\"\\n]\\n\")\n",
"\n",
"print(\"Embeddings extracted and saved.\")"
]
},
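{
"cell_type": "code",
"execution_count": null,
"id": "4c8e0f6d",
"metadata": {},
"outputs": [],
"source": [
"# Optional verification sketch: reload the file written above and confirm every record\n",
"# has a filepath and an embedding of consistent length (assumes the extraction cell ran).\n",
"import json\n",
"\n",
"with open(\"embeddings_factures_osteopathie_1k_qwen.json\", \"r\") as f:\n",
"    records = json.load(f)\n",
"\n",
"dims = {len(r[\"embedding\"]) for r in records}\n",
"incomplete = [r for r in records if not r.get(\"filepath\") or not r.get(\"embedding\")]\n",
"print(f\"{len(records)} records, embedding dims: {sorted(dims)}, incomplete records: {len(incomplete)}\")"
]
},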
{
"cell_type": "code",
"execution_count": 4,
"id": "2c3e6dd0",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Loaded 2800 samples with embedding dimension 2048\n",
"Applied L2 normalization to embeddings\n",
"(2800, 2048)\n",
"(3918600,)\n",
"mean sim: 0.37961555 std: 0.22605234\n"
]
}
],
"source": [
"from sklearn.cluster import DBSCAN, MeanShift, AffinityPropagation\n",
"from sklearn.preprocessing import normalize\n",
"from sklearn.metrics import silhouette_score\n",
"from sklearn.neighbors import NearestNeighbors\n",
"from sklearn.decomposition import PCA\n",
"import argparse\n",
"import warnings\n",
"warnings.filterwarnings('ignore')\n",
"from sklearn.metrics.pairwise import cosine_similarity\n",
"from datetime import datetime\n",
"import json\n",
"\n",
"embeddings_path = \"/home/nguyendc/sonnh/embedding-clustering/extract/embeddings_factures_osteopathie_1k_qwen.json\"\n",
"with open(embeddings_path, 'r') as f:\n",
" data = json.load(f)\n",
"\n",
"file_paths = []\n",
"embeddings_list = []\n",
"\n",
"for item in data:\n",
" file_paths.append(item['filepath'])\n",
" embeddings_list.append(item['embedding'])\n",
"\n",
"embeddings = np.array(embeddings_list, dtype=np.float32)\n",
"print(f\"Loaded {len(file_paths)} samples with embedding dimension {embeddings.shape[1]}\")\n",
"\n",
"# Normalize embeddings using L2 normalization for cosine distance\n",
"embeddings_normalized = normalize(embeddings, norm='l2', axis=1)\n",
"print(\"Applied L2 normalization to embeddings\")\n",
"\n",
"sims = cosine_similarity(embeddings)\n",
"print(embeddings.shape)\n",
"# lấy upper triangle exclude diagonal để inspect\n",
"triu_idxs = np.triu_indices_from(sims, k=1)\n",
"dist_vals = sims[triu_idxs]\n",
"print(dist_vals.shape)\n",
"print(\"mean sim:\", dist_vals.mean(), \"std:\", dist_vals.std())"
]
},
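{
"cell_type": "code",
"execution_count": null,
"id": "3d5f8a02",
"metadata": {},
"outputs": [],
"source": [
"# Illustrative clustering sketch using the estimators imported above (DBSCAN, NearestNeighbors,\n",
"# silhouette_score) on embeddings_normalized from the previous cell. The eps value is a rough\n",
"# heuristic read off the k-distance curve, not a tuned setting.\n",
"import numpy as np\n",
"from sklearn.cluster import DBSCAN\n",
"from sklearn.neighbors import NearestNeighbors\n",
"from sklearn.metrics import silhouette_score\n",
"\n",
"# k-distance heuristic: distance to the k-th nearest neighbour (self excluded) guides eps\n",
"k = 5\n",
"nn = NearestNeighbors(n_neighbors=k + 1, metric=\"cosine\").fit(embeddings_normalized)\n",
"kth_dist = np.sort(nn.kneighbors(embeddings_normalized)[0][:, -1])\n",
"eps = float(np.percentile(kth_dist, 90))  # rough elbow proxy; plot kth_dist to refine\n",
"print(f\"eps guess from {k}-NN distances: {eps:.4f}\")\n",
"\n",
"db = DBSCAN(eps=eps, min_samples=k, metric=\"cosine\").fit(embeddings_normalized)\n",
"labels = db.labels_\n",
"n_clusters = len(set(labels)) - (1 if -1 in labels else 0)\n",
"n_noise = int((labels == -1).sum())\n",
"print(f\"DBSCAN found {n_clusters} clusters, {n_noise} noise points\")\n",
"\n",
"# Silhouette is only defined for >= 2 clusters; exclude noise points\n",
"mask = labels != -1\n",
"if n_clusters >= 2 and mask.sum() > n_clusters:\n",
"    score = silhouette_score(embeddings_normalized[mask], labels[mask], metric=\"cosine\")\n",
"    print(f\"silhouette (cosine, noise excluded): {score:.3f}\")"
]
},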
{
"cell_type": "code",
"execution_count": null,
"id": "29620d93",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "27fea4f3",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 100% |███████████████| 1091/1091 [174.1ms elapsed, 0s remaining, 6.3K samples/s] \n"
]
}
],
"source": [
"import fiftyone as fo\n",
"import fiftyone.brain as fob\n",
"import numpy as np\n",
"from sklearn.mixture import GaussianMixture\n",
"import json\n",
"\n",
"DATASET_NAME = \"mock\"\n",
"\n",
"json_path = \"./embeddings_factures_osteopathie_1k_qwen.json\"\n",
"\n",
"with open(json_path, \"r\") as file:\n",
" embedding_data = json.load(file)\n",
"\n",
"file_paths = []\n",
"embeddings = []\n",
"for i, record in enumerate(embedding_data):\n",
" file_paths.append(record.get(\"filepath\"))\n",
" embeddings.append(record.get(\"embedding\"))\n",
"\n",
"if DATASET_NAME in fo.list_datasets():\n",
" dataset = fo.load_dataset(DATASET_NAME)\n",
" dataset.delete()\n",
"dataset = fo.Dataset(DATASET_NAME)\n",
"\n",
"# Add samples to the dataset\n",
"samples = [fo.Sample(filepath=p) for p in file_paths]\n",
"dataset.add_samples(samples)\n",
"\n",
"# Building Gaussian mixture model (GMM)\n",
"n_gaussians = 50\n",
"gmm = GaussianMixture(n_components=n_gaussians, random_state=42)\n",
"gmm.fit(embeddings)\n",
"cluster_labels = gmm.predict(embeddings)\n",
"\n",
"# Adding labeled embeddings to visulization\n",
"dataset.add_sample_field(\"gmm_cluster\", fo.IntField)\n",
"for sample, label in zip(dataset, cluster_labels):\n",
" sample[\"gmm_cluster_50_gaussians\"] = int(label)\n",
" sample.save()\n",
"\n",
"n_gaussians = 200\n",
"gmm = GaussianMixture(n_components=n_gaussians, random_state=42)\n",
"gmm.fit(embeddings)\n",
"cluster_labels = gmm.predict(embeddings)\n",
"\n",
"# Adding labeled embeddings to visulization\n",
"dataset.add_sample_field(\"gmm_cluster\", fo.IntField)\n",
"for sample, label in zip(dataset, cluster_labels):\n",
" sample[\"gmm_cluster_200_gaussians\"] = int(label)\n",
" sample.save()\n",
"\n",
"# --- Visualize the Embeddings with UMAP ---\n",
"# This will compute a 2D representation of your embeddings\n",
"# for visualization.\n",
"res = fob.compute_visualization(\n",
" dataset,\n",
" embeddings=embeddings,\n",
" brain_key=\"qwen_vision_viz\",\n",
" method=\"tsne\",\n",
" verbose=True\n",
")\n",
"dataset.set_values(\"qwen_umap\", res.current_points)\n",
"\n",
"print(\"UMAP visualization computed. Launch the app to see the plot.\")\n",
"session = fo.launch_app(dataset)"
]
}
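,
{
"cell_type": "code",
"execution_count": null,
"id": "9b7e4c55",
"metadata": {},
"outputs": [],
"source": [
"# Rough comparison sketch of the two GMM granularities above: read the stored cluster fields\n",
"# back from the dataset and score each labeling. Silhouette on cosine distance is one possible\n",
"# yardstick, not the only one; assumes the GMM cells above have been run.\n",
"import numpy as np\n",
"from sklearn.metrics import silhouette_score\n",
"\n",
"X = np.asarray(embeddings, dtype=np.float32)\n",
"for field in [\"gmm_cluster_50_gaussians\", \"gmm_cluster_200_gaussians\"]:\n",
"    labels = np.asarray(dataset.values(field))\n",
"    score = silhouette_score(X, labels, metric=\"cosine\")\n",
"    print(f\"{field}: {len(set(labels.tolist()))} clusters, silhouette (cosine) = {score:.3f}\")"
]
}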
],
"metadata": {
"kernelspec": {
"display_name": "venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}