{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "59f8a415", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/nguyendc/sonnh/venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2025-09-02 13:50:30.358544: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n", "WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n", "E0000 00:00:1756821030.369428 3858431 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n", "E0000 00:00:1756821030.372761 3858431 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n", "W0000 00:00:1756821030.382108 3858431 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n", "W0000 00:00:1756821030.382119 3858431 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n", "W0000 00:00:1756821030.382121 3858431 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n", "W0000 00:00:1756821030.382123 3858431 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n", "2025-09-02 13:50:30.385619: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", "To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[2025-09-02 13:50:35,304] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/usr/bin/ld: cannot find -laio: No such file or directory\n", "collect2: error: ld returned 1 exit status\n", "/usr/bin/ld: cannot find -laio: No such file or directory\n", "collect2: error: ld returned 1 exit status\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Using device: cuda\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "A new version of the following files was downloaded from https://huggingface.co/OpenGVLab/InternVL3_5-4B-Instruct:\n", "- configuration_intern_vit.py\n", ". Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.\n", "A new version of the following files was downloaded from https://huggingface.co/OpenGVLab/InternVL3_5-4B-Instruct:\n", "- configuration_internvl_chat.py\n", "- configuration_intern_vit.py\n", ". Make sure to double-check they do not contain any added malicious code. 
To avoid downloading new versions of the code file, you can pin a revision.\n", "A new version of the following files was downloaded from https://huggingface.co/OpenGVLab/InternVL3_5-4B-Instruct:\n", "- conversation.py\n", ". Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.\n", "A new version of the following files was downloaded from https://huggingface.co/OpenGVLab/InternVL3_5-4B-Instruct:\n", "- modeling_intern_vit.py\n", ". Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.\n", "A new version of the following files was downloaded from https://huggingface.co/OpenGVLab/InternVL3_5-4B-Instruct:\n", "- modeling_internvl_chat.py\n", "- conversation.py\n", "- modeling_intern_vit.py\n", ". Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.\n", "Fetching 2 files: 100%|██████████| 2/2 [00:00<00:00, 7.56it/s]\n", "Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00, 1.03s/it]\n" ] } ], "source": [ "import torch\n", "from transformers import AutoModel, AutoProcessor, AutoTokenizer\n", "from PIL import Image\n", "import os\n", "import numpy as np\n", "from tqdm import tqdm\n", "\n", "# --- Configuration ---\n", "# MODEL_NAME = \"OpenGVLab/InternVL3_5-4B\" # You can choose other model sizes\n", "MODEL_NAME = \"OpenGVLab/InternVL3_5-4B-Instruct\"\n", "\n", "IMAGE_DIR = \"/home/nguyendc/model-factory/Finetuning-Automation/etc/data/media/docai_mgp_facture_v2_0/\"\n", "BATCH_SIZE = 4\n", "# --- End Configuration ---\n", "\n", "# Check for GPU availability\n", "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", "print(f\"Using device: {device}\")\n", "\n", "# Load the model, processor and tokenizer (remote code is required for InternVL)\n", "model = AutoModel.from_pretrained(\n", " MODEL_NAME,\n", " torch_dtype=torch.bfloat16,\n", " use_flash_attn=True,\n", " attn_implementation=\"flash_attention_2\",\n", " trust_remote_code=True,\n", " device_map=\"cuda\").eval()\n", "\n", "processor = AutoProcessor.from_pretrained(\n", " MODEL_NAME, \n", " trust_remote_code=True\n", " )\n", "tokenizer = AutoTokenizer.from_pretrained(\n", " MODEL_NAME, \n", " trust_remote_code=True, \n", " use_fast=False\n", " )" ] }, { "cell_type": "code", "execution_count": 3, "id": "6d826d19", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "InternVLChatModel(\n", " (vision_model): InternVisionModel(\n", " (embeddings): InternVisionEmbeddings(\n", " (patch_embedding): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))\n", " )\n", " (encoder): InternVisionEncoder(\n", " (layers): ModuleList(\n", " (0-23): 24 x InternVisionEncoderLayer(\n", " (attn): InternAttention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj_drop): Dropout(p=0.0, inplace=False)\n", " (inner_attn): FlashAttention()\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " )\n", " (mlp): InternMLP(\n", " (act): GELUActivation()\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " )\n", " (norm1): 
LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (drop_path1): Identity()\n", " (drop_path2): Identity()\n", " )\n", " )\n", " )\n", " )\n", " (language_model): Qwen3ForCausalLM(\n", " (model): Qwen3Model(\n", " (embed_tokens): Embedding(151936, 2560)\n", " (layers): ModuleList(\n", " (0-35): 36 x Qwen3DecoderLayer(\n", " (self_attn): Qwen3Attention(\n", " (q_proj): Linear(in_features=2560, out_features=4096, bias=False)\n", " (k_proj): Linear(in_features=2560, out_features=1024, bias=False)\n", " (v_proj): Linear(in_features=2560, out_features=1024, bias=False)\n", " (o_proj): Linear(in_features=4096, out_features=2560, bias=False)\n", " (q_norm): Qwen3RMSNorm((128,), eps=1e-06)\n", " (k_norm): Qwen3RMSNorm((128,), eps=1e-06)\n", " )\n", " (mlp): Qwen3MLP(\n", " (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)\n", " (up_proj): Linear(in_features=2560, out_features=9728, bias=False)\n", " (down_proj): Linear(in_features=9728, out_features=2560, bias=False)\n", " (act_fn): SiLU()\n", " )\n", " (input_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)\n", " (post_attention_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)\n", " )\n", " )\n", " (norm): Qwen3RMSNorm((2560,), eps=1e-06)\n", " (rotary_emb): Qwen3RotaryEmbedding()\n", " )\n", " (lm_head): Linear(in_features=2560, out_features=151936, bias=False)\n", " )\n", " (mlp1): Sequential(\n", " (0): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " (1): Linear(in_features=4096, out_features=2560, bias=True)\n", " (2): GELU(approximate='none')\n", " (3): Linear(in_features=2560, out_features=2560, bias=True)\n", " )\n", ")" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model" ] }, { "cell_type": "code", "execution_count": 9, "id": "7bbfcf47", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "InternVisionModel(\n", " (embeddings): InternVisionEmbeddings(\n", " (patch_embedding): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))\n", " )\n", " (encoder): InternVisionEncoder(\n", " (layers): ModuleList(\n", " (0-23): 24 x InternVisionEncoderLayer(\n", " (attn): InternAttention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj_drop): Dropout(p=0.0, inplace=False)\n", " (inner_attn): FlashAttention()\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " )\n", " (mlp): InternMLP(\n", " (act): GELUActivation()\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " )\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (drop_path1): Identity()\n", " (drop_path2): Identity()\n", " )\n", " )\n", " )\n", ")" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.vision_model" ] }, { "cell_type": "markdown", "id": "ae26d6cf", "metadata": {}, "source": [ "# demo ?" 
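, "\n", "\n", "A minimal sketch of how the pieces defined below fit together (assuming the `load_image` and `image_embedding_mean` helpers from the following cells; the image path is a placeholder):\n", "\n", "```python\n", "# hypothetical path - point this at a real image file\n", "pv = load_image(\"/path/to/image.jpg\", max_num=12).to(torch.bfloat16).cuda()\n", "vec = image_embedding_mean(model, pv, pool=\"mean\", normalize=True)  # [1, 1024]\n", "```\n"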
] }, { "cell_type": "code", "execution_count": null, "id": "817d3ccb", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 4, "id": "d41f94bd", "metadata": {}, "outputs": [], "source": [ "# import math\n", "import numpy as np\n", "import torch\n", "import torchvision.transforms as T\n", "# from decord import VideoReader, cpu\n", "from PIL import Image\n", "from torchvision.transforms.functional import InterpolationMode\n", "# from modelscope import AutoModel, AutoTokenizer\n", "\n", "IMAGENET_MEAN = (0.485, 0.456, 0.406)\n", "IMAGENET_STD = (0.229, 0.224, 0.225)\n", "\n", "def build_transform(input_size):\n", " MEAN, STD = IMAGENET_MEAN, IMAGENET_STD\n", " transform = T.Compose([\n", " T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),\n", " T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),\n", " T.ToTensor(),\n", " T.Normalize(mean=MEAN, std=STD)\n", " ])\n", " return transform\n", "\n", "def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):\n", " best_ratio_diff = float('inf')\n", " best_ratio = (1, 1)\n", " area = width * height\n", " for ratio in target_ratios:\n", " target_aspect_ratio = ratio[0] / ratio[1]\n", " ratio_diff = abs(aspect_ratio - target_aspect_ratio)\n", " if ratio_diff < best_ratio_diff:\n", " best_ratio_diff = ratio_diff\n", " best_ratio = ratio\n", " elif ratio_diff == best_ratio_diff:\n", " if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:\n", " best_ratio = ratio\n", " return best_ratio\n", "\n", "def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):\n", " orig_width, orig_height = image.size\n", " aspect_ratio = orig_width / orig_height\n", "\n", " # calculate the existing image aspect ratio\n", " target_ratios = set(\n", " (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if\n", " i * j <= max_num and i * j >= min_num)\n", " target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])\n", "\n", " # find the closest aspect ratio to the target\n", " target_aspect_ratio = find_closest_aspect_ratio(\n", " aspect_ratio, target_ratios, orig_width, orig_height, image_size)\n", "\n", " # calculate the target width and height\n", " target_width = image_size * target_aspect_ratio[0]\n", " target_height = image_size * target_aspect_ratio[1]\n", " blocks = target_aspect_ratio[0] * target_aspect_ratio[1]\n", "\n", " # resize the image\n", " resized_img = image.resize((target_width, target_height))\n", " processed_images = []\n", " for i in range(blocks):\n", " box = (\n", " (i % (target_width // image_size)) * image_size,\n", " (i // (target_width // image_size)) * image_size,\n", " ((i % (target_width // image_size)) + 1) * image_size,\n", " ((i // (target_width // image_size)) + 1) * image_size\n", " )\n", " # split the image\n", " split_img = resized_img.crop(box)\n", " processed_images.append(split_img)\n", " assert len(processed_images) == blocks\n", " if use_thumbnail and len(processed_images) != 1:\n", " thumbnail_img = image.resize((image_size, image_size))\n", " processed_images.append(thumbnail_img)\n", " return processed_images\n", "\n", "def load_image(image_file, input_size=448, max_num=12):\n", " image = Image.open(image_file).convert('RGB')\n", " transform = build_transform(input_size=input_size)\n", " images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)\n", " pixel_values = [transform(image) for image in 
images]\n", " pixel_values = torch.stack(pixel_values)\n", " return pixel_values" ] }, { "cell_type": "markdown", "id": "f2ec71a4", "metadata": {}, "source": [ "# Attention pooling\n" ] }, { "cell_type": "code", "execution_count": 5, "id": "a404fa19", "metadata": {}, "outputs": [], "source": [ "import torch\n", "import torch.nn.functional as F\n", "\n", "def gem_pool(x, p: float = 3.0, eps: float = 1e-6):\n", " # x: [B, N, D]\n", " return (x.clamp(min=eps).pow(p).mean(dim=1)).pow(1.0/p)\n", "\n", "@torch.no_grad()\n", "def image_embedding(pixel_values, model, use_tiling=True):\n", " # pixel_values: [T,3,H,W] when using InternVL-style tiling, or [1,3,H,W] if you resized the image yourself\n", " out = model.vision_model(pixel_values=pixel_values, output_hidden_states=True)\n", " tok = out.last_hidden_state # [T, N, 1024] or [1, N, 1024]\n", " if tok.dim() == 2: # in case the model returns [N, D]\n", " tok = tok.unsqueeze(0)\n", "\n", " # 1) Token-level attention pooling within each tile\n", " w_tok = torch.softmax(tok.norm(dim=-1), dim=1).unsqueeze(-1) # [T,N,1]\n", " attn_tile = (tok * w_tok).sum(dim=1) # [T,1024]\n", "\n", " # 2) Other token-level poolings\n", " mean_tile = tok.mean(dim=1) # [T,1024]\n", " max_tile = tok.max(dim=1).values # [T,1024]\n", " gem_tile = gem_pool(tok, p=3.0) # [T,1024]\n", "\n", " # 3) Attention across tiles (keeps multi-scale information but stays compact)\n", " tile_scores = attn_tile.norm(dim=-1) # [T]\n", " w_tile = torch.softmax(tile_scores, dim=0).unsqueeze(-1) # [T,1]\n", "\n", " mean_vec = (mean_tile * w_tile).sum(dim=0)\n", " max_vec = (max_tile * w_tile).sum(dim=0)\n", " gem_vec = (gem_tile * w_tile).sum(dim=0)\n", " attn_vec = (attn_tile * w_tile).sum(dim=0)\n", "\n", " # 4) Fuse the complementary views into one information-rich vector\n", " one_vec = torch.cat([mean_vec, max_vec, gem_vec, attn_vec], dim=0) # [4*1024]\n", " one_vec = F.normalize(one_vec, dim=-1).unsqueeze(0) # [1, 4096]\n", " return one_vec.half() # FP16 to save memory" ] }, { "cell_type": "markdown", "id": "ed35a4ce", "metadata": {}, "source": [ "# pool" ] }, { "cell_type": "code", "execution_count": 6, "id": "3edf8b67", "metadata": {}, "outputs": [], "source": [ "import torch\n", "import torch.nn.functional as F\n", "\n", "# --- Token-level pooling (within a single tile) ---\n", "def _pool_tokens(tokens: torch.Tensor, how: str = \"mean\") -> torch.Tensor:\n", " \"\"\"\n", " tokens: [1, N, D] or [N, D]\n", " return: [D]\n", " \"\"\"\n", " if tokens.dim() == 3: # [1, N, D] -> [N, D]\n", " tokens = tokens.squeeze(0)\n", "\n", " if how == \"mean\":\n", " v = tokens.mean(dim=0)\n", " elif how == \"max\":\n", " v = tokens.max(dim=0).values\n", " elif how == \"gem\":\n", " p = 3.0\n", " v = (tokens.clamp(min=1e-6).pow(p).mean(dim=0)).pow(1.0/p)\n", " elif how == \"cls\":\n", " # only valid if the backbone puts a CLS token at position 0\n", " v = tokens[0]\n", " else:\n", " raise ValueError(f\"Unknown pooling: {how}\")\n", "\n", " return v\n", "\n", "\n", "@torch.no_grad()\n", "def image_embedding_global(model, pixel_values: torch.Tensor,\n", " pool: str = \"mean\",\n", " normalize: bool = False,\n", " global_index: int = 0,\n", " use_projector: bool = False) -> torch.Tensor:\n", " \"\"\"\n", " Return a single [1, D] vector for the whole image, using only the GLOBAL tile.\n", " - pixel_values: [T,3,H,W] (e.g. T=7) or [1,3,H,W]\n", " - global_index: index of the whole-image tile; note that load_image above appends the thumbnail tile last (index T-1), so adjust accordingly\n", " - use_projector: ONLY enable if you are sure the token dim matches the projector (mlp1)\n", " \"\"\"\n", " model.eval()\n", " device = next(model.parameters()).device\n", " x = 
pixel_values.to(device)\n", "\n", " out = model.vision_model(pixel_values=x) # last_hidden_state: [T, N, D] or [1, N, D]\n", " tok = out.last_hidden_state\n", "\n", " # select the global (whole-image) tile\n", " if tok.size(0) > 1:\n", " tok = tok[global_index:global_index+1] # [1, N, D]\n", "\n", " # (optional) projection into another feature space - beware of dimension mismatches!\n", " if use_projector:\n", " # only enable when you are sure mlp1's input dim matches tok.size(-1)\n", " in_feat = getattr(model.mlp1[1], \"in_features\", None)\n", " if in_feat is not None and tok.size(-1) == in_feat:\n", " tok = model.mlp1(tok) # [1, N, D']\n", " else:\n", " raise ValueError(f\"Projector input dim mismatch: tokens={tok.size(-1)} vs mlp1.in={in_feat}\")\n", "\n", " v = _pool_tokens(tok, how=pool) # [D]\n", " if normalize:\n", " v = F.normalize(v, dim=-1)\n", " return v.unsqueeze(0) # [1, D]\n", "\n", "\n", "@torch.no_grad()\n", "def image_embedding_mean(model, pixel_values: torch.Tensor,\n", " pool: str = \"mean\",\n", " normalize: bool = True,\n", " use_projector: bool = False) -> torch.Tensor:\n", " \"\"\"\n", " Return a single [1, D] vector for the whole image, by:\n", " (1) pooling tokens within each tile -> [T, D]\n", " (2) averaging across tiles -> [D]\n", " \"\"\"\n", " model.eval()\n", " device = next(model.parameters()).device\n", " x = pixel_values.to(device)\n", "\n", " out = model.vision_model(pixel_values=x)\n", " tok = out.last_hidden_state # [T, N, D] or [1, N, D]\n", "\n", " if use_projector:\n", " in_feat = getattr(model.mlp1[1], \"in_features\", None)\n", " if in_feat is not None and tok.size(-1) == in_feat:\n", " tok = model.mlp1(tok)\n", " else:\n", " raise ValueError(f\"Projector input dim mismatch: tokens={tok.size(-1)} vs mlp1.in={in_feat}\")\n", "\n", " # pool tokens within each tile\n", " T = tok.size(0)\n", " per_tile = [ _pool_tokens(tok[t:t+1], how=pool) for t in range(T) ] # list of [D]\n", " per_tile = torch.stack(per_tile, dim=0) # [T, D]\n", "\n", " # mean across tiles\n", " v = per_tile.mean(dim=0) # [D]\n", " if normalize:\n", " v = F.normalize(v, dim=-1)\n", " return v.unsqueeze(0) # [1, D]\n" ] }, { "cell_type": "markdown", "id": "613cf001", "metadata": {}, "source": [ "# infer" ] }, { "cell_type": "code", "execution_count": 7, "id": "cdfdab0e", "metadata": {}, "outputs": [], "source": [
"def get_image_embedding(paths):\n", "    \"\"\"\n", "    Extract one embedding per image for a single path or a list of image paths.\n", "    \"\"\"\n", "    if isinstance(paths, str):\n", "        paths = [paths]\n", "    valid_paths = [p for p in paths if p.lower().endswith(('.png', '.jpg', '.jpeg'))]\n", "\n", "    if not valid_paths:\n", "        return np.array([])\n", "\n", "    embeddings = []\n", "    for valid_path in valid_paths:\n", "        try:\n", "            # Tile the image and move it to the GPU in bfloat16 to match the model weights.\n", "            pixel_values = load_image(valid_path, max_num=12).to(torch.bfloat16).cuda()\n", "        except Exception as e:\n", "            print(f\"Warning: Could not load image {valid_path}. Skipping. Error: {e}\")\n", "            continue\n", "        # One pooled vector per image: token pooling per tile, then mean across tiles.\n", "        # embeddings.append(image_embedding(pixel_values, model, use_tiling=True))\n", "        embeddings.append(image_embedding_mean(model, pixel_values))\n", "\n", "    if not embeddings:\n", "        return np.array([])\n", "    return torch.cat(embeddings, dim=0).to(torch.float16).cpu().numpy()" ] }, { "cell_type": "code", "execution_count": 8, "id": "cdaebb7b", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 2800/2800 [20:51<00:00, 2.24it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Embeddings extracted and saved.\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "import json\n", "\n", "# --- Process all images in the directory and stream the embeddings to a JSON file ---\n", "image_files = [os.path.join(IMAGE_DIR, f) for f in os.listdir(IMAGE_DIR) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]\n", "BATCH_SIZE = 1\n", "\n", "with open(\"embeddings_factures_osteopathie_1k_InternVL3_5-4B-Instruct.json\", \"w\") as f:\n", "    f.write(\"[\\n\")\n", "    first = True\n", "    for i in tqdm(range(0, len(image_files), BATCH_SIZE)):\n", "        batch_paths = image_files[i:i + BATCH_SIZE]\n", "        batch_embeddings = get_image_embedding(batch_paths)\n", "        embeddings_list = [emb.tolist() for emb in batch_embeddings]\n", "        for path, emb in zip(batch_paths, embeddings_list):\n", "            if not first:\n", "                f.write(\",\\n\")\n", "            json.dump({\"filepath\": path, \"embedding\": emb}, f)\n", "            first = False\n", "    f.write(\"\\n]\\n\")\n", "\n", "print(\"Embeddings extracted and saved.\")" ] }, { "cell_type": "markdown", "id": "f0d0bf0a", "metadata": {}, "source": [ "# check" ] }, { "cell_type": "code", "execution_count": 10, "id": "0772fc89", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Loaded 2800 samples with embedding dimension 1024\n", "Applied L2 normalization to embeddings\n", "(2800, 1024)\n", "(3918600,)\n", "mean sim: 0.9939966 std: 0.0073577887\n" ] } ], "source": [ "from sklearn.cluster import DBSCAN, MeanShift, AffinityPropagation\n", "from sklearn.preprocessing import normalize\n", "from sklearn.metrics import silhouette_score\n", "from sklearn.neighbors import NearestNeighbors\n", "from sklearn.decomposition import PCA\n", "import argparse\n", "import warnings\n", "warnings.filterwarnings('ignore')\n", "from sklearn.metrics.pairwise import cosine_similarity\n", "from datetime import datetime\n", "\n", "\n", "embeddings_path = \"/home/nguyendc/sonnh/embedding-clustering/extract/embeddings_factures_osteopathie_1k_InternVL3_5-4B-Instruct.json\"\n", "with open(embeddings_path, 'r') as f:\n", " data = json.load(f)\n", "\n", "file_paths = []\n", "embeddings_list = []\n", "\n", "for item in data:\n", " file_paths.append(item['filepath'])\n", " embeddings_list.append(item['embedding'])\n", "\n", "embeddings = np.array(embeddings_list, dtype=np.float32)\n", "print(f\"Loaded {len(file_paths)} samples with embedding dimension {embeddings.shape[1]}\")\n", "\n", "# Normalize embeddings using L2 normalization for cosine distance\n", "embeddings_normalized = 
normalize(embeddings, norm='l2', axis=1)\n", "print(\"Applied L2 normalization to embeddings\")\n", "\n", "sims = cosine_similarity(embeddings)\n", "print(embeddings.shape)\n", "# take the upper triangle (excluding the diagonal) to inspect pairwise similarities\n", "triu_idxs = np.triu_indices_from(sims, k=1)\n", "dist_vals = sims[triu_idxs]\n", "print(dist_vals.shape)\n", "print(\"mean sim:\", dist_vals.mean(), \"std:\", dist_vals.std())" ] }, { "cell_type": "markdown", "id": "cb4ea42b", "metadata": {}, "source": [ "# temp" ] }, { "cell_type": "code", "execution_count": 9, "id": "2c3e6dd0", "metadata": {}, "outputs": [], "source": [ "image_path = \"/home/nguyendc/model-factory/Finetuning-Automation/etc/data/media/docai_mgp_facture_v2_0/c363e486-5d45-425e-aef9-4791cad120f7_20250213_120759_1_scale_1.0.jpg\"" ] }, { "cell_type": "code", "execution_count": 11, "id": "29620d93", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.\n", "Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "User: \n", "Please describe the image shortly.\n", "Assistant: The image shows a receipt for a consultation with Noël Breignaud, an osteopath. It includes his contact information, with the address \"104, cours des fossés, 33210 Langon\" and his phone number. The receipt details a payment of 55€ for a consultation dated 15/06/2020. The receipt number is 1750401922774. There are handwritten details and signatures, with the amount and date written in ink. Noël Breignaud's signature and a circular stamp are also present.\n", "User: \n", "Please describe the image in detail.\n", "Assistant: The image is a handwritten receipt or invoice from a practitioner named Noël Breign aud, who is an osteopath. The text on the left side of the document includes the following details:\n", "\n", "- **Name:** Noël Breignaud\n", "- **Profession:** Ouestopathe (Osteopath)\n", "- **Address:** 104, cours des fossés, 33210 Lagnon\n", "- **Phone Number:** Tel. 06 88 70 66 43\n", "\n", "On the right side, there are registration and identification numbers:\n", "\n", "- **Nº SIRET:** 510 123 631 00010\n", "- **Nº ADELI:** 330001108\n", "- **Code APE:** 8690E\n", "\n", "The handwritten section of the document is in French and reads:\n", "\n", "- \"Déclaire avoir reçu de M. M. (fils) G[obon], Acquitté la somme de 55 €\n", "Pour 1 consultation en date du 05/04/2024\n", "N°: 1750460-19212774\"\n", "\n", "At the bottom right, there is a signature that appears to be of Noël Breignaud, with a red stamp partially visible, which seems to contain the text \"Noël BREIGNAUD\" and other markings.\n", "\n", "The date in the handwritten section is \"05/04/2024,\" indicating the receipt or service provided on that date. 
The amount mentioned is 55 euros for one consultation.\n" ] } ], "source": [ "pixel_values = load_image(image_path, max_num=12).to(torch.bfloat16).cuda()\n", "generation_config = dict(max_new_tokens=1024, do_sample=True)\n", "\n", "question = '\\nPlease describe the image shortly.'\n", "response = model.chat(tokenizer, pixel_values, question, generation_config)\n", "print(f'User: {question}\\nAssistant: {response}')\n", "\n", "# single-image multi-round conversation\n", "question = '\\nPlease describe the image in detail.'\n", "response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)\n", "print(f'User: {question}\\nAssistant: {response}')" ] }, { "cell_type": "code", "execution_count": 23, "id": "35dc90e0", "metadata": {}, "outputs": [], "source": [ "vout = model.vision_model(pixel_values=pixel_values, output_hidden_states=True)" ] }, { "cell_type": "code", "execution_count": null, "id": "77f3720a", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "torch.Size([7, 1025, 1024])\n" ] } ], "source": [ "patch_feats = vout.last_hidden_state # [B, N_patches, Dv], Dv ~ 1024 for this architecture\n", "print(patch_feats.shape)\n", "# If the backbone has a CLS token, you can use patch_feats[:,0]\n", "# A generally safe option: mean-pool\n", "# img_vec = patch_feats.mean(dim=1) # [B, Dv]\n", "# img_vec = torch.nn.functional.normalize(img_vec, dim=-1) # L2 normalize for retrieval\n" ] }, { "cell_type": "code", "execution_count": null, "id": "0043634c", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "torch.Size([7, 1024])" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# img_vec.shape" ] }, { "cell_type": "code", "execution_count": null, "id": "92032162", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.12" } }, "nbformat": 4, "nbformat_minor": 5 }