{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "59f8a415",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/nguyendc/sonnh/venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2025-09-02 13:50:30.358544: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
"WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n",
"E0000 00:00:1756821030.369428 3858431 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
"E0000 00:00:1756821030.372761 3858431 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
"W0000 00:00:1756821030.382108 3858431 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
"W0000 00:00:1756821030.382119 3858431 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
"W0000 00:00:1756821030.382121 3858431 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
"W0000 00:00:1756821030.382123 3858431 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
"2025-09-02 13:50:30.385619: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
"To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2025-09-02 13:50:35,304] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/bin/ld: cannot find -laio: No such file or directory\n",
"collect2: error: ld returned 1 exit status\n",
"/usr/bin/ld: cannot find -laio: No such file or directory\n",
"collect2: error: ld returned 1 exit status\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Using device: cuda\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"A new version of the following files was downloaded from https://huggingface.co/OpenGVLab/InternVL3_5-4B-Instruct:\n",
"- configuration_intern_vit.py\n",
". Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.\n",
"A new version of the following files was downloaded from https://huggingface.co/OpenGVLab/InternVL3_5-4B-Instruct:\n",
"- configuration_internvl_chat.py\n",
"- configuration_intern_vit.py\n",
". Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.\n",
"A new version of the following files was downloaded from https://huggingface.co/OpenGVLab/InternVL3_5-4B-Instruct:\n",
"- conversation.py\n",
". Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.\n",
"A new version of the following files was downloaded from https://huggingface.co/OpenGVLab/InternVL3_5-4B-Instruct:\n",
"- modeling_intern_vit.py\n",
". Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.\n",
"A new version of the following files was downloaded from https://huggingface.co/OpenGVLab/InternVL3_5-4B-Instruct:\n",
"- modeling_internvl_chat.py\n",
"- conversation.py\n",
"- modeling_intern_vit.py\n",
". Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.\n",
"Fetching 2 files: 100%|██████████| 2/2 [00:00<00:00, 7.56it/s]\n",
"Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00, 1.03s/it]\n"
]
}
],
"source": [
"import torch\n",
"from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor, AutoModel, InternVLChatModel\n",
"# from qwen_vl_utils import process_vision_info\n",
"from PIL import Image\n",
"import os\n",
"import numpy as np\n",
"from tqdm import tqdm\n",
"import math\n",
"import torch\n",
"from transformers import AutoTokenizer, AutoModel\n",
"import timm\n",
"\n",
"# --- Configuration ---\n",
"# MODEL_NAME = \"OpenGVLab/InternVL3_5-4B\" # You can choose other model sizes\n",
"MODEL_NAME = \"OpenGVLab/InternVL3_5-4B-Instruct\"\n",
"\n",
"\n",
"IMAGE_DIR = \"/home/nguyendc/model-factory/Finetuning-Automation/etc/data/media/docai_mgp_facture_v2_0/\"\n",
"BATCH_SIZE = 4\n",
"# --- End Configuration ---\n",
"\n",
"# Check for GPU availability\n",
"device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
"print(f\"Using device: {device}\")\n",
"\n",
"# Load the model and processor\n",
"\n",
"model = AutoModel.from_pretrained(\n",
" MODEL_NAME,\n",
" torch_dtype=torch.bfloat16,\n",
" use_flash_attn=True,\n",
" attn_implementation=\"flash_attention_2\",\n",
" trust_remote_code=True,\n",
" device_map=\"cuda\").eval()\n",
"\n",
"processor = AutoProcessor.from_pretrained(\n",
" MODEL_NAME, \n",
" trust_remote_code=True\n",
" )\n",
"tokenizer = AutoTokenizer.from_pretrained(\n",
" MODEL_NAME, \n",
" trust_remote_code=True, \n",
" use_fast=False\n",
" )"
]
},
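{
"cell_type": "markdown",
"id": "3f9a2b10",
"metadata": {},
"source": [
"Optional sanity check: report the device and dtype the weights landed on, plus the vision encoder's patch-embedding width, which is the 1024-dim token size the pooling code below assumes. Only standard nn.Module attributes are used."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3f9a2b11",
"metadata": {},
"outputs": [],
"source": [
"# Sanity-check sketch: confirm device/dtype and the vision hidden size (1024)\n",
"# that the pooling functions below rely on.\n",
"param = next(model.parameters())\n",
"vis_dim = model.vision_model.embeddings.patch_embedding.out_channels\n",
"n_params = sum(p.numel() for p in model.parameters())\n",
"print(f\"device: {param.device}, dtype: {param.dtype}\")\n",
"print(f\"vision hidden size: {vis_dim}, total parameters: {n_params / 1e9:.2f}B\")"
]
},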
{
"cell_type": "code",
"execution_count": 3,
"id": "6d826d19",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"InternVLChatModel(\n",
" (vision_model): InternVisionModel(\n",
" (embeddings): InternVisionEmbeddings(\n",
" (patch_embedding): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))\n",
" )\n",
" (encoder): InternVisionEncoder(\n",
" (layers): ModuleList(\n",
" (0-23): 24 x InternVisionEncoderLayer(\n",
" (attn): InternAttention(\n",
" (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
" (attn_drop): Dropout(p=0.0, inplace=False)\n",
" (proj_drop): Dropout(p=0.0, inplace=False)\n",
" (inner_attn): FlashAttention()\n",
" (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
" )\n",
" (mlp): InternMLP(\n",
" (act): GELUActivation()\n",
" (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
" (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
" )\n",
" (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
" (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
" (drop_path1): Identity()\n",
" (drop_path2): Identity()\n",
" )\n",
" )\n",
" )\n",
" )\n",
" (language_model): Qwen3ForCausalLM(\n",
" (model): Qwen3Model(\n",
" (embed_tokens): Embedding(151936, 2560)\n",
" (layers): ModuleList(\n",
" (0-35): 36 x Qwen3DecoderLayer(\n",
" (self_attn): Qwen3Attention(\n",
" (q_proj): Linear(in_features=2560, out_features=4096, bias=False)\n",
" (k_proj): Linear(in_features=2560, out_features=1024, bias=False)\n",
" (v_proj): Linear(in_features=2560, out_features=1024, bias=False)\n",
" (o_proj): Linear(in_features=4096, out_features=2560, bias=False)\n",
" (q_norm): Qwen3RMSNorm((128,), eps=1e-06)\n",
" (k_norm): Qwen3RMSNorm((128,), eps=1e-06)\n",
" )\n",
" (mlp): Qwen3MLP(\n",
" (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)\n",
" (up_proj): Linear(in_features=2560, out_features=9728, bias=False)\n",
" (down_proj): Linear(in_features=9728, out_features=2560, bias=False)\n",
" (act_fn): SiLU()\n",
" )\n",
" (input_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)\n",
" (post_attention_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)\n",
" )\n",
" )\n",
" (norm): Qwen3RMSNorm((2560,), eps=1e-06)\n",
" (rotary_emb): Qwen3RotaryEmbedding()\n",
" )\n",
" (lm_head): Linear(in_features=2560, out_features=151936, bias=False)\n",
" )\n",
" (mlp1): Sequential(\n",
" (0): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
" (1): Linear(in_features=4096, out_features=2560, bias=True)\n",
" (2): GELU(approximate='none')\n",
" (3): Linear(in_features=2560, out_features=2560, bias=True)\n",
" )\n",
")"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "7bbfcf47",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"InternVisionModel(\n",
" (embeddings): InternVisionEmbeddings(\n",
" (patch_embedding): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))\n",
" )\n",
" (encoder): InternVisionEncoder(\n",
" (layers): ModuleList(\n",
" (0-23): 24 x InternVisionEncoderLayer(\n",
" (attn): InternAttention(\n",
" (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
" (attn_drop): Dropout(p=0.0, inplace=False)\n",
" (proj_drop): Dropout(p=0.0, inplace=False)\n",
" (inner_attn): FlashAttention()\n",
" (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
" )\n",
" (mlp): InternMLP(\n",
" (act): GELUActivation()\n",
" (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
" (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
" )\n",
" (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
" (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
" (drop_path1): Identity()\n",
" (drop_path2): Identity()\n",
" )\n",
" )\n",
" )\n",
")"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model.vision_model"
]
},
{
"cell_type": "markdown",
"id": "ae26d6cf",
"metadata": {},
"source": [
"# demo ?"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "817d3ccb",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 4,
"id": "d41f94bd",
"metadata": {},
"outputs": [],
"source": [
"# import math\n",
"import numpy as np\n",
"import torch\n",
"import torchvision.transforms as T\n",
"# from decord import VideoReader, cpu\n",
"from PIL import Image\n",
"from torchvision.transforms.functional import InterpolationMode\n",
"# from modelscope import AutoModel, AutoTokenizer\n",
"\n",
"IMAGENET_MEAN = (0.485, 0.456, 0.406)\n",
"IMAGENET_STD = (0.229, 0.224, 0.225)\n",
"\n",
"def build_transform(input_size):\n",
" MEAN, STD = IMAGENET_MEAN, IMAGENET_STD\n",
" transform = T.Compose([\n",
" T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),\n",
" T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),\n",
" T.ToTensor(),\n",
" T.Normalize(mean=MEAN, std=STD)\n",
" ])\n",
" return transform\n",
"\n",
"def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):\n",
" best_ratio_diff = float('inf')\n",
" best_ratio = (1, 1)\n",
" area = width * height\n",
" for ratio in target_ratios:\n",
" target_aspect_ratio = ratio[0] / ratio[1]\n",
" ratio_diff = abs(aspect_ratio - target_aspect_ratio)\n",
" if ratio_diff < best_ratio_diff:\n",
" best_ratio_diff = ratio_diff\n",
" best_ratio = ratio\n",
" elif ratio_diff == best_ratio_diff:\n",
" if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:\n",
" best_ratio = ratio\n",
" return best_ratio\n",
"\n",
"def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):\n",
" orig_width, orig_height = image.size\n",
" aspect_ratio = orig_width / orig_height\n",
"\n",
" # calculate the existing image aspect ratio\n",
" target_ratios = set(\n",
" (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if\n",
" i * j <= max_num and i * j >= min_num)\n",
" target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])\n",
"\n",
" # find the closest aspect ratio to the target\n",
" target_aspect_ratio = find_closest_aspect_ratio(\n",
" aspect_ratio, target_ratios, orig_width, orig_height, image_size)\n",
"\n",
" # calculate the target width and height\n",
" target_width = image_size * target_aspect_ratio[0]\n",
" target_height = image_size * target_aspect_ratio[1]\n",
" blocks = target_aspect_ratio[0] * target_aspect_ratio[1]\n",
"\n",
" # resize the image\n",
" resized_img = image.resize((target_width, target_height))\n",
" processed_images = []\n",
" for i in range(blocks):\n",
" box = (\n",
" (i % (target_width // image_size)) * image_size,\n",
" (i // (target_width // image_size)) * image_size,\n",
" ((i % (target_width // image_size)) + 1) * image_size,\n",
" ((i // (target_width // image_size)) + 1) * image_size\n",
" )\n",
" # split the image\n",
" split_img = resized_img.crop(box)\n",
" processed_images.append(split_img)\n",
" assert len(processed_images) == blocks\n",
" if use_thumbnail and len(processed_images) != 1:\n",
" thumbnail_img = image.resize((image_size, image_size))\n",
" processed_images.append(thumbnail_img)\n",
" return processed_images\n",
"\n",
"def load_image(image_file, input_size=448, max_num=12):\n",
" image = Image.open(image_file).convert('RGB')\n",
" transform = build_transform(input_size=input_size)\n",
" images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)\n",
" pixel_values = [transform(image) for image in images]\n",
" pixel_values = torch.stack(pixel_values)\n",
" return pixel_values"
]
},
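{
"cell_type": "markdown",
"id": "3f9a2b12",
"metadata": {},
"source": [
"A small self-contained sketch of the tiling logic above: it builds a synthetic, roughly A4-shaped PIL image (the 1654x2339 size is only an assumption for illustration) and shows how many 448x448 tiles `dynamic_preprocess` produces, including the thumbnail."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3f9a2b13",
"metadata": {},
"outputs": [],
"source": [
"# Sketch: inspect the tiling behaviour on a synthetic image.\n",
"# The 1654x2339 size is an assumed A4-like page, not a file from the dataset.\n",
"demo_img = Image.new(\"RGB\", (1654, 2339), color=\"white\")\n",
"tiles = dynamic_preprocess(demo_img, image_size=448, use_thumbnail=True, max_num=12)\n",
"print(f\"aspect ratio: {demo_img.size[0] / demo_img.size[1]:.3f}\")\n",
"print(f\"number of tiles (incl. thumbnail): {len(tiles)}\")\n",
"print(f\"tile size: {tiles[0].size}\")"
]
},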
{
"cell_type": "markdown",
"id": "f2ec71a4",
"metadata": {},
"source": [
"# Attention pooling\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "a404fa19",
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"import torch.nn.functional as F\n",
"\n",
"def gem_pool(x, p: float = 3.0, eps: float = 1e-6):\n",
" # x: [B, N, D]\n",
" return (x.clamp(min=eps).pow(p).mean(dim=1)).pow(1.0/p)\n",
"\n",
"@torch.no_grad()\n",
"def image_embedding(pixel_values, model, use_tiling=True):\n",
" # pixel_values: nếu dùng processor của InternVL, có thể là [T,3,H,W]; nếu bạn tự resize = [1,3,H,W]\n",
" out = model.vision_model(pixel_values=pixel_values, output_hidden_states=True)\n",
" tok = out.last_hidden_state # [T, N, 1024] hoặc [1, N, 1024]\n",
" if tok.dim() == 2: # phòng trường hợp model trả [N, D]\n",
" tok = tok.unsqueeze(0)\n",
"\n",
" # 1) Attention pooling theo token, trong từng tile\n",
" w_tok = torch.softmax(tok.norm(dim=-1), dim=1).unsqueeze(-1) # [T,N,1]\n",
" attn_tile = (tok * w_tok).sum(dim=1) # [T,1024]\n",
"\n",
" # 2) Các pooling khác theo token\n",
" mean_tile = tok.mean(dim=1) # [T,1024]\n",
" max_tile = tok.max(dim=1).values # [T,1024]\n",
" gem_tile = gem_pool(tok, p=3.0) # [T,1024]\n",
"\n",
" # 3) Attention across-tiles (giữ multi-scale nhưng gọn)\n",
" tile_scores = attn_tile.norm(dim=-1) # [T]\n",
" w_tile = torch.softmax(tile_scores, dim=0).unsqueeze(-1) # [T,1]\n",
"\n",
" mean_vec = (mean_tile * w_tile).sum(dim=0)\n",
" max_vec = (max_tile * w_tile).sum(dim=0)\n",
" gem_vec = (gem_tile * w_tile).sum(dim=0)\n",
" attn_vec = (attn_tile * w_tile).sum(dim=0)\n",
"\n",
" # 4) Hợp nhất nhiều “góc nhìn” → 1 vector giàu thông tin\n",
" one_vec = torch.cat([mean_vec, max_vec, gem_vec, attn_vec], dim=0) # [4*1024]\n",
" one_vec = F.normalize(one_vec, dim=-1).unsqueeze(0) # [1, 4096]\n",
" return one_vec.half() # FP16 để tiết kiệm bộ nhớ"
]
},
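{
"cell_type": "markdown",
"id": "3f9a2b14",
"metadata": {},
"source": [
"Usage sketch for `image_embedding`: embed one sample image (arbitrarily, the first file in `IMAGE_DIR`) and check that the concatenated mean/max/GeM/attention pooling yields a [1, 4096] vector. Assumes the model and `load_image` cells above have been run."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3f9a2b15",
"metadata": {},
"outputs": [],
"source": [
"# Usage sketch: embed one image with the multi-view pooling above.\n",
"# The sample is simply the first file in IMAGE_DIR; any image would do.\n",
"sample_path = sorted(\n",
"    os.path.join(IMAGE_DIR, f) for f in os.listdir(IMAGE_DIR)\n",
"    if f.lower().endswith(('.png', '.jpg', '.jpeg'))\n",
")[0]\n",
"px = load_image(sample_path, max_num=12).to(torch.bfloat16).cuda()\n",
"vec = image_embedding(px, model, use_tiling=True)\n",
"print(vec.shape)  # expected [1, 4096]: mean / max / GeM / attention pooling concatenated"
]
},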
{
"cell_type": "markdown",
"id": "ed35a4ce",
"metadata": {},
"source": [
"# pool"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "3edf8b67",
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"import torch.nn.functional as F\n",
"\n",
"# --- Pooling theo token (trong 1 tile) ---\n",
"def _pool_tokens(tokens: torch.Tensor, how: str = \"mean\") -> torch.Tensor:\n",
" \"\"\"\n",
" tokens: [1, N, D] hoặc [N, D]\n",
" return: [D]\n",
" \"\"\"\n",
" if tokens.dim() == 3: # [1, N, D] -> [N, D]\n",
" tokens = tokens.squeeze(0)\n",
"\n",
" if how == \"mean\":\n",
" v = tokens.mean(dim=0)\n",
" elif how == \"max\":\n",
" v = tokens.max(dim=0).values\n",
" elif how == \"gem\":\n",
" p = 3.0\n",
" v = (tokens.clamp(min=1e-6).pow(p).mean(dim=0)).pow(1.0/p)\n",
" elif how == \"cls\":\n",
" # chỉ dùng nếu backbone có CLS token ở vị trí đầu\n",
" v = tokens[0]\n",
" else:\n",
" raise ValueError(f\"Unknown pooling: {how}\")\n",
"\n",
" return v\n",
"\n",
"\n",
"@torch.no_grad()\n",
"def image_embedding_global(model, pixel_values: torch.Tensor,\n",
" pool: str = \"mean\",\n",
" normalize: bool = False,\n",
" global_index: int = 0,\n",
" use_projector: bool = False) -> torch.Tensor:\n",
" \"\"\"\n",
" Trả về 1 vector [1, D] mô tả toàn ảnh, chỉ dùng GLOBAL tile.\n",
" - pixel_values: [T,3,H,W] (ví dụ T=7) hoặc [1,3,H,W]\n",
" - global_index: thường = 0 (tile toàn ảnh nằm đầu)\n",
" - use_projector: CHỈ bật nếu bạn chắc chắn chiều khớp với projector (mlp1)\n",
" \"\"\"\n",
" model.eval()\n",
" device = next(model.parameters()).device\n",
" x = pixel_values.to(device)\n",
"\n",
" out = model.vision_model(pixel_values=x) # last_hidden_state: [T, N, D] hoặc [1, N, D]\n",
" tok = out.last_hidden_state\n",
"\n",
" # chọn global tile\n",
" if tok.size(0) > 1:\n",
" tok = tok[global_index:global_index+1] # [1, N, D]\n",
"\n",
" # (tuỳ chọn) projector sang không gian khác - cẩn thận mismatch chiều!\n",
" if use_projector:\n",
" # CHỈ nên bật khi biết chắc input dim của mlp1 khớp với tok.size(-1)\n",
" in_feat = getattr(model.mlp1[1], \"in_features\", None)\n",
" if in_feat is not None and tok.size(-1) == in_feat:\n",
" tok = model.mlp1(tok) # [1, N, D]\n",
" else:\n",
" raise ValueError(f\"Projector input dim mismatch: tokens={tok.size(-1)} vs mlp1.in={in_feat}\")\n",
"\n",
" v = _pool_tokens(tok, how=pool) # [D]\n",
" if normalize:\n",
" v = F.normalize(v, dim=-1)\n",
" return v.unsqueeze(0) # [1, D]\n",
"\n",
"\n",
"@torch.no_grad()\n",
"def image_embedding_mean(model, pixel_values: torch.Tensor,\n",
" pool: str = \"mean\",\n",
" normalize: bool = True,\n",
" use_projector: bool = False) -> torch.Tensor:\n",
" \"\"\"\n",
" Trả về 1 vector [1, D] mô tả toàn ảnh, bằng cách:\n",
" (1) pool theo token trong từng tile → [T, D]\n",
" (2) lấy mean across-tiles → [D]\n",
" \"\"\"\n",
" model.eval()\n",
" device = next(model.parameters()).device\n",
" x = pixel_values.to(device)\n",
"\n",
" out = model.vision_model(pixel_values=x)\n",
" tok = out.last_hidden_state # [T, N, D] hoặc [1, N, D]\n",
"\n",
" if use_projector:\n",
" in_feat = getattr(model.mlp1[1], \"in_features\", None)\n",
" if in_feat is not None and tok.size(-1) == in_feat:\n",
" tok = model.mlp1(tok)\n",
" else:\n",
" raise ValueError(f\"Projector input dim mismatch: tokens={tok.size(-1)} vs mlp1.in={in_feat}\")\n",
"\n",
" # pool theo token trong từng tile\n",
" T = tok.size(0)\n",
" per_tile = [ _pool_tokens(tok[t:t+1], how=pool) for t in range(T) ] # list of [D]\n",
" per_tile = torch.stack(per_tile, dim=0) # [T, D]\n",
"\n",
" # mean across-tiles\n",
" v = per_tile.mean(dim=0) # [D]\n",
" if normalize:\n",
" v = F.normalize(v, dim=-1)\n",
" return v.unsqueeze(0) # [1, D]\n"
]
},
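{
"cell_type": "markdown",
"id": "3f9a2b16",
"metadata": {},
"source": [
"A short comparison sketch for the two helpers above: embed the same sample image with `image_embedding_global` (global tile only) and `image_embedding_mean` (mean across tiles) and print the cosine similarity between the two 1024-dim vectors. The sample image is again just the first file in `IMAGE_DIR`."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3f9a2b17",
"metadata": {},
"outputs": [],
"source": [
"# Sketch: compare global-tile pooling vs. mean-across-tiles pooling on one image.\n",
"sample_path = sorted(\n",
"    os.path.join(IMAGE_DIR, f) for f in os.listdir(IMAGE_DIR)\n",
"    if f.lower().endswith(('.png', '.jpg', '.jpeg'))\n",
")[0]\n",
"px = load_image(sample_path, max_num=12).to(torch.bfloat16).cuda()\n",
"\n",
"v_global = image_embedding_global(model, px, pool=\"mean\", normalize=True)  # [1, 1024]\n",
"v_mean = image_embedding_mean(model, px, pool=\"mean\", normalize=True)      # [1, 1024]\n",
"\n",
"print(v_global.shape, v_mean.shape)\n",
"cos = F.cosine_similarity(v_global.float(), v_mean.float(), dim=-1)\n",
"print(f\"cosine(global tile, mean of tiles) = {cos.item():.4f}\")"
]
},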
{
"cell_type": "markdown",
"id": "613cf001",
"metadata": {},
"source": [
"# infer"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "cdfdab0e",
"metadata": {},
"outputs": [],
"source": [
"def get_image_embedding(path):\n",
" \"\"\"\n",
" Processes a batch of images and extracts their embeddings.\n",
" \"\"\"\n",
" images_pil = []\n",
" valid_paths = []\n",
" if path.lower().endswith(('.png', '.jpg', '.jpeg')):\n",
" try:\n",
" # The processor expects PIL images in RGB format\n",
" # images_pil.append(Image.open(path).convert(\"RGB\"))\n",
" # print(path)\n",
" valid_paths.append(path)\n",
" except Exception as e:\n",
" print(f\"Warning: Could not load image {path}. Skipping. Error: {e}\")\n",
"\n",
" if not valid_paths:\n",
" return np.array([]), []\n",
"\n",
" all_pixel_values = []\n",
" for valid_path in valid_paths:\n",
" pixel_values = load_image(valid_path, max_num=12).to(torch.bfloat16).cuda()\n",
" # print(pixel_values.shape)\n",
" all_pixel_values.append(pixel_values)\n",
" # For pure vision feature extraction, we can provide an empty text prompt.\n",
" # The processor handles tokenizing text and preparing images.\n",
" inputs = torch.cat(all_pixel_values, dim=0).to(device)\n",
" \n",
" # embeddings = image_embedding(inputs, model, use_tiling=True)\n",
" embeddings = image_embedding_mean(model, inputs)\n",
" \n",
" return embeddings.to(torch.float16).cpu().numpy()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "cdaebb7b",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 2800/2800 [20:51<00:00, 2.24it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Embeddings extracted and saved.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"import json\n",
"\n",
"# --- Process all images in the directory ---\n",
"image_files = [os.path.join(IMAGE_DIR, f) for f in os.listdir(IMAGE_DIR) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]\n",
"all_embeddings = []\n",
"filepaths = []\n",
"BATCH_SIZE = 1\n",
"\n",
"with open(\"embeddings_factures_osteopathie_1k_InternVL3_5-4B-Instruct.json\", \"w\") as f:\n",
"\n",
" f.write(\"[\\n\")\n",
" first = True\n",
" for i in tqdm(range(0, len(image_files), BATCH_SIZE)):\n",
" batch_paths = image_files[i]\n",
" batch_embeddings = get_image_embedding(batch_paths)\n",
" embeddings_list = [emb.tolist() for emb in batch_embeddings]\n",
" for path, emb in zip(batch_paths, embeddings_list):\n",
" if not first:\n",
" f.write(\",\\n\")\n",
" json.dump({\"filepath\": path, \"embedding\": emb}, f)\n",
" first = False\n",
" f.write(\"\\n]\\n\")\n",
"\n",
"print(\"Embeddings extracted and saved.\")"
]
},
{
"cell_type": "markdown",
"id": "f0d0bf0a",
"metadata": {},
"source": [
"# check"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "0772fc89",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Loaded 2800 samples with embedding dimension 1024\n",
"Applied L2 normalization to embeddings\n",
"(2800, 1024)\n",
"(3918600,)\n",
"mean sim: 0.9939966 std: 0.0073577887\n"
]
}
],
"source": [
"from sklearn.cluster import DBSCAN, MeanShift, AffinityPropagation\n",
"from sklearn.preprocessing import normalize\n",
"from sklearn.metrics import silhouette_score\n",
"from sklearn.neighbors import NearestNeighbors\n",
"from sklearn.decomposition import PCA\n",
"import argparse\n",
"import warnings\n",
"warnings.filterwarnings('ignore')\n",
"from sklearn.metrics.pairwise import cosine_similarity\n",
"from datetime import datetime\n",
"\n",
"\n",
"embeddings_path = \"/home/nguyendc/sonnh/embedding-clustering/extract/embeddings_factures_osteopathie_1k_InternVL3_5-4B-Instruct.json\"\n",
"with open(embeddings_path, 'r') as f:\n",
" data = json.load(f)\n",
"\n",
"file_paths = []\n",
"embeddings_list = []\n",
"\n",
"for item in data:\n",
" file_paths.append(item['filepath'])\n",
" embeddings_list.append(item['embedding'])\n",
"\n",
"embeddings = np.array(embeddings_list, dtype=np.float32)\n",
"print(f\"Loaded {len(file_paths)} samples with embedding dimension {embeddings.shape[1]}\")\n",
"\n",
"# Normalize embeddings using L2 normalization for cosine distance\n",
"embeddings_normalized = normalize(embeddings, norm='l2', axis=1)\n",
"print(\"Applied L2 normalization to embeddings\")\n",
"\n",
"sims = cosine_similarity(embeddings)\n",
"print(embeddings.shape)\n",
"# lấy upper triangle exclude diagonal để inspect\n",
"triu_idxs = np.triu_indices_from(sims, k=1)\n",
"dist_vals = sims[triu_idxs]\n",
"print(dist_vals.shape)\n",
"print(\"mean sim:\", dist_vals.mean(), \"std:\", dist_vals.std())"
]
},
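{
"cell_type": "markdown",
"id": "3f9a2b18",
"metadata": {},
"source": [
"With a mean pairwise similarity around 0.994, any clustering will be very sensitive to the distance threshold. The sketch below is one possible follow-up using the DBSCAN import above with cosine distance on the normalized embeddings; the `eps` and `min_samples` values are rough assumptions to tune, not validated settings for this data."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3f9a2b19",
"metadata": {},
"outputs": [],
"source": [
"# Sketch: cluster the normalized embeddings with DBSCAN on cosine distance.\n",
"# eps/min_samples are guesses; with mean pairwise similarity ~0.994 the cosine\n",
"# distances are tiny, so eps must be small and will need tuning.\n",
"db = DBSCAN(eps=0.005, min_samples=5, metric=\"cosine\")\n",
"labels = db.fit_predict(embeddings_normalized)\n",
"\n",
"n_clusters = len(set(labels)) - (1 if -1 in labels else 0)\n",
"n_noise = int(np.sum(labels == -1))\n",
"print(f\"clusters: {n_clusters}, noise points: {n_noise}\")\n",
"\n",
"# Silhouette score is only defined for at least 2 clusters (noise excluded)\n",
"mask = labels != -1\n",
"if n_clusters >= 2:\n",
"    score = silhouette_score(embeddings_normalized[mask], labels[mask], metric=\"cosine\")\n",
"    print(f\"silhouette (cosine, noise excluded): {score:.3f}\")"
]
},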
{
"cell_type": "markdown",
"id": "cb4ea42b",
"metadata": {},
"source": [
"# temp"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "2c3e6dd0",
"metadata": {},
"outputs": [],
"source": [
"image_path = \"/home/nguyendc/model-factory/Finetuning-Automation/etc/data/media/docai_mgp_facture_v2_0/c363e486-5d45-425e-aef9-4791cad120f7_20250213_120759_1_scale_1.0.jpg\""
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "29620d93",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"User: <image>\n",
"Please describe the image shortly.\n",
"Assistant: The image shows a receipt for a consultation with Noël Breignaud, an osteopath. It includes his contact information, with the address \"104, cours des fossés, 33210 Langon\" and his phone number. The receipt details a payment of 55€ for a consultation dated 15/06/2020. The receipt number is 1750401922774. There are handwritten details and signatures, with the amount and date written in ink. Noël Breignaud's signature and a circular stamp are also present.\n",
"User: <image>\n",
"Please describe the image in detail.\n",
"Assistant: The image is a handwritten receipt or invoice from a practitioner named Noël Breign aud, who is an osteopath. The text on the left side of the document includes the following details:\n",
"\n",
"- **Name:** Noël Breignaud\n",
"- **Profession:** Ouestopathe (Osteopath)\n",
"- **Address:** 104, cours des fossés, 33210 Lagnon\n",
"- **Phone Number:** Tel. 06 88 70 66 43\n",
"\n",
"On the right side, there are registration and identification numbers:\n",
"\n",
"- **Nº SIRET:** 510 123 631 00010\n",
"- **Nº ADELI:** 330001108\n",
"- **Code APE:** 8690E\n",
"\n",
"The handwritten section of the document is in French and reads:\n",
"\n",
"- \"Déclaire avoir reçu de M. M. (fils) G[obon], Acquitté la somme de 55 €\n",
"Pour 1 consultation en date du 05/04/2024\n",
"N°: 1750460-19212774\"\n",
"\n",
"At the bottom right, there is a signature that appears to be of Noël Breignaud, with a red stamp partially visible, which seems to contain the text \"Noël BREIGNAUD\" and other markings.\n",
"\n",
"The date in the handwritten section is \"05/04/2024,\" indicating the receipt or service provided on that date. The amount mentioned is 55 euros for one consultation.\n"
]
}
],
"source": [
"pixel_values = load_image(image_path, max_num=12).to(torch.bfloat16).cuda()\n",
"generation_config = dict(max_new_tokens=1024, do_sample=True)\n",
"\n",
"\n",
"\n",
"question = '<image>\\nPlease describe the image shortly.'\n",
"response = model.chat(tokenizer, pixel_values, question, generation_config)\n",
"print(f'User: {question}\\nAssistant: {response}')\n",
"\n",
"# single-image multi-round conversation (单图多轮对话)\n",
"question = '<image>\\nPlease describe the image in detail.'\n",
"response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)\n",
"print(f'User: {question}\\nAssistant: {response}')"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "35dc90e0",
"metadata": {},
"outputs": [],
"source": [
"vout = model.vision_model(pixel_values=pixel_values, output_hidden_states=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "77f3720a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"torch.Size([7, 1025, 1024])\n"
]
}
],
"source": [
"patch_feats = vout.last_hidden_state # [B, N_patches, Dv], Dv ~ 1024 theo kiến trúc của bạn\n",
"print(patch_feats.shape)\n",
"# Nếu backbone có CLS token, bạn có thể dùng patch_feats[:,0]\n",
"# Cách an toàn chung: mean-pool\n",
"# img_vec = patch_feats.mean(dim=1) # [B, Dv]\n",
"# img_vec = torch.nn.functional.normalize(img_vec, dim=-1) # L2 normalize cho retrieval"
]
},
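{
"cell_type": "markdown",
"id": "3f9a2b20",
"metadata": {},
"source": [
"A small follow-up sketch of the mean-pool route suggested in the comments above: pool each tile's tokens from `patch_feats`, L2-normalize, and look at how similar the tiles of this single document are to each other."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3f9a2b21",
"metadata": {},
"outputs": [],
"source": [
"# Sketch: mean-pool each tile's tokens and L2-normalize, as suggested above.\n",
"img_vec = patch_feats.mean(dim=1)                         # [T, Dv] = [7, 1024]\n",
"img_vec = torch.nn.functional.normalize(img_vec, dim=-1)  # L2 normalize for retrieval\n",
"print(img_vec.shape)\n",
"\n",
"# Pairwise cosine similarity between the tiles of this single document\n",
"tile_sims = img_vec.float() @ img_vec.float().T           # [T, T]\n",
"print(tile_sims.round(decimals=3))"
]
},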
{
"cell_type": "code",
"execution_count": null,
"id": "0043634c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"torch.Size([7, 1024])"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# img_vec.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "92032162",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}