{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "59f8a415",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2025-09-02 15:00:12.976185: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
"WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n",
"E0000 00:00:1756825212.987686 3903757 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
"E0000 00:00:1756825212.991038 3903757 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
"W0000 00:00:1756825213.000855 3903757 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
"W0000 00:00:1756825213.000880 3903757 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
"W0000 00:00:1756825213.000882 3903757 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
"W0000 00:00:1756825213.000884 3903757 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
"2025-09-02 15:00:13.005218: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
"To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2025-09-02 15:00:17,970] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/bin/ld: cannot find -laio: No such file or directory\n",
"collect2: error: ld returned 1 exit status\n",
"/usr/bin/ld: cannot find -laio: No such file or directory\n",
"collect2: error: ld returned 1 exit status\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Using device: cuda\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00, 1.09it/s]\n",
"Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.\n"
]
}
],
"source": [
"import torch\n",
"from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor\n",
"# from qwen_vl_utils import process_vision_info\n",
"from PIL import Image\n",
"import os\n",
"import numpy as np\n",
"from tqdm import tqdm\n",
"\n",
"# --- Configuration ---\n",
"MODEL_NAME = \"Qwen/Qwen2.5-VL-3B-Instruct\" # You can choose other model sizes\n",
"\n",
"IMAGE_DIR = \"/home/nguyendc/model-factory/Finetuning-Automation/etc/data/media/docai_mgp_facture_v2_0/\"\n",
"BATCH_SIZE = 4\n",
"# --- End Configuration ---\n",
"\n",
"# Check for GPU availability\n",
"device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
"print(f\"Using device: {device}\")\n",
"\n",
"# Load the model and processor\n",
"model = Qwen2_5_VLForConditionalGeneration.from_pretrained(\n",
" MODEL_NAME, torch_dtype=\"bfloat16\", device_map=\"cuda\", attn_implementation=\"flash_attention_2\",\n",
")\n",
"processor = AutoProcessor.from_pretrained(MODEL_NAME, trust_remote_code=True)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "13479e1a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Qwen2_5_VLProcessor:\n",
"- image_processor: Qwen2VLImageProcessor {\n",
" \"do_convert_rgb\": true,\n",
" \"do_normalize\": true,\n",
" \"do_rescale\": true,\n",
" \"do_resize\": true,\n",
" \"image_mean\": [\n",
" 0.48145466,\n",
" 0.4578275,\n",
" 0.40821073\n",
" ],\n",
" \"image_processor_type\": \"Qwen2VLImageProcessor\",\n",
" \"image_std\": [\n",
" 0.26862954,\n",
" 0.26130258,\n",
" 0.27577711\n",
" ],\n",
" \"max_pixels\": 12845056,\n",
" \"merge_size\": 2,\n",
" \"min_pixels\": 3136,\n",
" \"patch_size\": 14,\n",
" \"processor_class\": \"Qwen2_5_VLProcessor\",\n",
" \"resample\": 3,\n",
" \"rescale_factor\": 0.00392156862745098,\n",
" \"size\": {\n",
" \"longest_edge\": 12845056,\n",
" \"shortest_edge\": 3136\n",
" },\n",
" \"temporal_patch_size\": 2\n",
"}\n",
"\n",
"- tokenizer: Qwen2TokenizerFast(name_or_path='Qwen/Qwen2.5-VL-3B-Instruct', vocab_size=151643, model_max_length=131072, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={\n",
"\t151643: AddedToken(\"<|endoftext|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t151644: AddedToken(\"<|im_start|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t151645: AddedToken(\"<|im_end|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t151646: AddedToken(\"<|object_ref_start|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t151647: AddedToken(\"<|object_ref_end|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t151648: AddedToken(\"<|box_start|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t151649: AddedToken(\"<|box_end|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t151650: AddedToken(\"<|quad_start|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t151651: AddedToken(\"<|quad_end|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t151652: AddedToken(\"<|vision_start|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t151653: AddedToken(\"<|vision_end|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t151654: AddedToken(\"<|vision_pad|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t151655: AddedToken(\"<|image_pad|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t151656: AddedToken(\"<|video_pad|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t151657: AddedToken(\"<tool_call>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n",
"\t151658: AddedToken(\"</tool_call>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n",
"\t151659: AddedToken(\"<|fim_prefix|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n",
"\t151660: AddedToken(\"<|fim_middle|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n",
"\t151661: AddedToken(\"<|fim_suffix|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n",
"\t151662: AddedToken(\"<|fim_pad|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n",
"\t151663: AddedToken(\"<|repo_name|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n",
"\t151664: AddedToken(\"<|file_sep|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n",
"}\n",
")\n",
"\n",
"{\n",
" \"processor_class\": \"Qwen2_5_VLProcessor\"\n",
"}"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"processor"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cdfdab0e",
"metadata": {},
"outputs": [],
"source": [
"def get_image_embeddings(image_paths):\n",
" \"\"\"\n",
" Processes a batch of images and extracts their embeddings.\n",
" \"\"\"\n",
" images_pil = []\n",
" valid_paths = []\n",
" for path in image_paths:\n",
" if path.lower().endswith(('.png', '.jpg', '.jpeg')):\n",
" try:\n",
" # The processor expects PIL images in RGB format\n",
" images_pil.append(Image.open(path).convert(\"RGB\"))\n",
" valid_paths.append(path)\n",
" except Exception as e:\n",
" print(f\"Warning: Could not load image {path}. Skipping. Error: {e}\")\n",
"\n",
" if not images_pil:\n",
" return np.array([]), []\n",
"\n",
" # For pure vision feature extraction, we can provide an empty text prompt.\n",
" # The processor handles tokenizing text and preparing images.\n",
" inputs = processor(\n",
" text=[\"\"] * len(images_pil),\n",
" images=images_pil,\n",
" padding=True,\n",
" return_tensors=\"pt\"\n",
" ).to(device)\n",
"\n",
" with torch.no_grad():\n",
" # Get the vision embeddings from the model's vision tower\n",
" vision_outputs = model.visual(inputs['pixel_values'].to(dtype=model.dtype), grid_thw=inputs['image_grid_thw'])\n",
" # We'll use the pooled output as the embedding\n",
" embeddings = vision_outputs\n",
"\n",
" return embeddings.to(torch.float16).cpu().numpy()"
]
},
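{
"cell_type": "code",
"execution_count": null,
"id": "b3f1d2a9",
"metadata": {},
"outputs": [],
"source": [
"# Quick sanity-check sketch: call get_image_embeddings on a couple of files from IMAGE_DIR\n",
"# and confirm we get one fixed-size vector per image.\n",
"# Assumes IMAGE_DIR exists and contains at least one .png/.jpg/.jpeg file.\n",
"sample_files = [os.path.join(IMAGE_DIR, f) for f in os.listdir(IMAGE_DIR)\n",
"                if f.lower().endswith(('.png', '.jpg', '.jpeg'))][:2]\n",
"if sample_files:\n",
"    test_embeddings = get_image_embeddings(sample_files)\n",
"    print(f\"{len(sample_files)} images -> embeddings of shape {test_embeddings.shape}\")\n",
"else:\n",
"    print(\"No images found in IMAGE_DIR; skipping the check.\")"
]
},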
{
"cell_type": "code",
"execution_count": null,
"id": "cdaebb7b",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 700/700 [22:12<00:00, 1.90s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Embeddings extracted and saved.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"import json\n",
"\n",
"# --- Process all images in the directory ---\n",
"image_files = [os.path.join(IMAGE_DIR, f) for f in os.listdir(IMAGE_DIR) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]\n",
"all_embeddings = []\n",
"filepaths = []\n",
"\n",
"with open(\"embeddings_factures_osteopathie_1k_qwen.json\", \"w\") as f:\n",
"\n",
" f.write(\"[\\n\")\n",
" first = True\n",
" for i in tqdm(range(0, len(image_files), BATCH_SIZE)):\n",
" batch_paths = image_files[i:i+BATCH_SIZE]\n",
" batch_embeddings = get_image_embeddings(batch_paths)\n",
" embeddings_list = [emb.tolist() for emb in batch_embeddings]\n",
" for path, emb in zip(batch_paths, embeddings_list):\n",
" if not first:\n",
" f.write(\",\\n\")\n",
" json.dump({\"filepath\": path, \"embedding\": emb}, f)\n",
" first = False\n",
" f.write(\"\\n]\\n\")\n",
"\n",
"print(\"Embeddings extracted and saved.\")"
]
},
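{
"cell_type": "code",
"execution_count": null,
"id": "4c8e0f6d",
"metadata": {},
"outputs": [],
"source": [
"# Optional verification sketch: reload the file written above and confirm every record\n",
"# has a filepath and an embedding of consistent length (assumes the extraction cell ran).\n",
"import json\n",
"\n",
"with open(\"embeddings_factures_osteopathie_1k_qwen.json\", \"r\") as f:\n",
"    records = json.load(f)\n",
"\n",
"dims = {len(r[\"embedding\"]) for r in records}\n",
"incomplete = [r for r in records if not r.get(\"filepath\") or not r.get(\"embedding\")]\n",
"print(f\"{len(records)} records, embedding dims: {sorted(dims)}, incomplete records: {len(incomplete)}\")"
]
},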
{
"cell_type": "code",
"execution_count": 4,
"id": "2c3e6dd0",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Loaded 2800 samples with embedding dimension 2048\n",
"Applied L2 normalization to embeddings\n",
"(2800, 2048)\n",
"(3918600,)\n",
"mean sim: 0.37961555 std: 0.22605234\n"
]
}
],
"source": [
"from sklearn.cluster import DBSCAN, MeanShift, AffinityPropagation\n",
"from sklearn.preprocessing import normalize\n",
"from sklearn.metrics import silhouette_score\n",
"from sklearn.neighbors import NearestNeighbors\n",
"from sklearn.decomposition import PCA\n",
"import argparse\n",
"import warnings\n",
"warnings.filterwarnings('ignore')\n",
"from sklearn.metrics.pairwise import cosine_similarity\n",
"from datetime import datetime\n",
"import json\n",
"\n",
"embeddings_path = \"/home/nguyendc/sonnh/embedding-clustering/extract/embeddings_factures_osteopathie_1k_qwen.json\"\n",
"with open(embeddings_path, 'r') as f:\n",
" data = json.load(f)\n",
"\n",
"file_paths = []\n",
"embeddings_list = []\n",
"\n",
"for item in data:\n",
" file_paths.append(item['filepath'])\n",
" embeddings_list.append(item['embedding'])\n",
"\n",
"embeddings = np.array(embeddings_list, dtype=np.float32)\n",
"print(f\"Loaded {len(file_paths)} samples with embedding dimension {embeddings.shape[1]}\")\n",
"\n",
"# Normalize embeddings using L2 normalization for cosine distance\n",
"embeddings_normalized = normalize(embeddings, norm='l2', axis=1)\n",
"print(\"Applied L2 normalization to embeddings\")\n",
"\n",
"sims = cosine_similarity(embeddings)\n",
"print(embeddings.shape)\n",
"# lấy upper triangle exclude diagonal để inspect\n",
"triu_idxs = np.triu_indices_from(sims, k=1)\n",
"dist_vals = sims[triu_idxs]\n",
"print(dist_vals.shape)\n",
"print(\"mean sim:\", dist_vals.mean(), \"std:\", dist_vals.std())"
]
},
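{
"cell_type": "code",
"execution_count": null,
"id": "3d5f8a02",
"metadata": {},
"outputs": [],
"source": [
"# Illustrative clustering sketch using the estimators imported above (DBSCAN, NearestNeighbors,\n",
"# silhouette_score) on embeddings_normalized from the previous cell. The eps value is a rough\n",
"# heuristic read off the k-distance curve, not a tuned setting.\n",
"import numpy as np\n",
"from sklearn.cluster import DBSCAN\n",
"from sklearn.neighbors import NearestNeighbors\n",
"from sklearn.metrics import silhouette_score\n",
"\n",
"# k-distance heuristic: distance to the k-th nearest neighbour (self excluded) guides eps\n",
"k = 5\n",
"nn = NearestNeighbors(n_neighbors=k + 1, metric=\"cosine\").fit(embeddings_normalized)\n",
"kth_dist = np.sort(nn.kneighbors(embeddings_normalized)[0][:, -1])\n",
"eps = float(np.percentile(kth_dist, 90))  # rough elbow proxy; plot kth_dist to refine\n",
"print(f\"eps guess from {k}-NN distances: {eps:.4f}\")\n",
"\n",
"db = DBSCAN(eps=eps, min_samples=k, metric=\"cosine\").fit(embeddings_normalized)\n",
"labels = db.labels_\n",
"n_clusters = len(set(labels)) - (1 if -1 in labels else 0)\n",
"n_noise = int((labels == -1).sum())\n",
"print(f\"DBSCAN found {n_clusters} clusters, {n_noise} noise points\")\n",
"\n",
"# Silhouette is only defined for >= 2 clusters; exclude noise points\n",
"mask = labels != -1\n",
"if n_clusters >= 2 and mask.sum() > n_clusters:\n",
"    score = silhouette_score(embeddings_normalized[mask], labels[mask], metric=\"cosine\")\n",
"    print(f\"silhouette (cosine, noise excluded): {score:.3f}\")"
]
},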
{
"cell_type": "code",
"execution_count": null,
"id": "29620d93",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "27fea4f3",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 100% |███████████████| 1091/1091 [174.1ms elapsed, 0s remaining, 6.3K samples/s] \n"
]
}
],
"source": [
"import fiftyone as fo\n",
"import fiftyone.brain as fob\n",
"import numpy as np\n",
"from sklearn.mixture import GaussianMixture\n",
"import json\n",
"\n",
"DATASET_NAME = \"mock\"\n",
"\n",
"json_path = \"./embeddings_factures_osteopathie_1k_qwen.json\"\n",
"\n",
"with open(json_path, \"r\") as file:\n",
" embedding_data = json.load(file)\n",
"\n",
"file_paths = []\n",
"embeddings = []\n",
"for i, record in enumerate(embedding_data):\n",
" file_paths.append(record.get(\"filepath\"))\n",
" embeddings.append(record.get(\"embedding\"))\n",
"\n",
"if DATASET_NAME in fo.list_datasets():\n",
" dataset = fo.load_dataset(DATASET_NAME)\n",
" dataset.delete()\n",
"dataset = fo.Dataset(DATASET_NAME)\n",
"\n",
"# Add samples to the dataset\n",
"samples = [fo.Sample(filepath=p) for p in file_paths]\n",
"dataset.add_samples(samples)\n",
"\n",
"# Building Gaussian mixture model (GMM)\n",
"n_gaussians = 50\n",
"gmm = GaussianMixture(n_components=n_gaussians, random_state=42)\n",
"gmm.fit(embeddings)\n",
"cluster_labels = gmm.predict(embeddings)\n",
"\n",
"# Adding labeled embeddings to visulization\n",
"dataset.add_sample_field(\"gmm_cluster\", fo.IntField)\n",
"for sample, label in zip(dataset, cluster_labels):\n",
" sample[\"gmm_cluster_50_gaussians\"] = int(label)\n",
" sample.save()\n",
"\n",
"n_gaussians = 200\n",
"gmm = GaussianMixture(n_components=n_gaussians, random_state=42)\n",
"gmm.fit(embeddings)\n",
"cluster_labels = gmm.predict(embeddings)\n",
"\n",
"# Adding labeled embeddings to visulization\n",
"dataset.add_sample_field(\"gmm_cluster\", fo.IntField)\n",
"for sample, label in zip(dataset, cluster_labels):\n",
" sample[\"gmm_cluster_200_gaussians\"] = int(label)\n",
" sample.save()\n",
"\n",
"# --- Visualize the Embeddings with UMAP ---\n",
"# This will compute a 2D representation of your embeddings\n",
"# for visualization.\n",
"res = fob.compute_visualization(\n",
" dataset,\n",
" embeddings=embeddings,\n",
" brain_key=\"qwen_vision_viz\",\n",
" method=\"tsne\",\n",
" verbose=True\n",
")\n",
"dataset.set_values(\"qwen_umap\", res.current_points)\n",
"\n",
"print(\"UMAP visualization computed. Launch the app to see the plot.\")\n",
"session = fo.launch_app(dataset)"
]
}
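,
{
"cell_type": "code",
"execution_count": null,
"id": "9b7e4c55",
"metadata": {},
"outputs": [],
"source": [
"# Rough comparison sketch of the two GMM granularities above: read the stored cluster fields\n",
"# back from the dataset and score each labeling. Silhouette on cosine distance is one possible\n",
"# yardstick, not the only one; assumes the GMM cells above have been run.\n",
"import numpy as np\n",
"from sklearn.metrics import silhouette_score\n",
"\n",
"X = np.asarray(embeddings, dtype=np.float32)\n",
"for field in [\"gmm_cluster_50_gaussians\", \"gmm_cluster_200_gaussians\"]:\n",
"    labels = np.asarray(dataset.values(field))\n",
"    score = silhouette_score(X, labels, metric=\"cosine\")\n",
"    print(f\"{field}: {len(set(labels.tolist()))} clusters, silhouette (cosine) = {score:.3f}\")"
]
}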
],
"metadata": {
"kernelspec": {
"display_name": "venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}