diff --git a/extract/clustering_example_InternVL3_5.ipynb b/extract/clustering_example_InternVL3_5.ipynb new file mode 100644 index 0000000..88d37b3 --- /dev/null +++ b/extract/clustering_example_InternVL3_5.ipynb @@ -0,0 +1,874 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "59f8a415", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/nguyendc/sonnh/venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-09-02 13:50:30.358544: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n", + "WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n", + "E0000 00:00:1756821030.369428 3858431 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n", + "E0000 00:00:1756821030.372761 3858431 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n", + "W0000 00:00:1756821030.382108 3858431 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n", + "W0000 00:00:1756821030.382119 3858431 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n", + "W0000 00:00:1756821030.382121 3858431 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n", + "W0000 00:00:1756821030.382123 3858431 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n", + "2025-09-02 13:50:30.385619: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", + "To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2025-09-02 13:50:35,304] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/bin/ld: cannot find -laio: No such file or directory\n", + "collect2: error: ld returned 1 exit status\n", + "/usr/bin/ld: cannot find -laio: No such file or directory\n", + "collect2: error: ld returned 1 exit status\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Using device: cuda\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "A new version of the following files was downloaded from https://huggingface.co/OpenGVLab/InternVL3_5-4B-Instruct:\n", + "- configuration_intern_vit.py\n", + ". Make sure to double-check they do not contain any added malicious code. 
To avoid downloading new versions of the code file, you can pin a revision.\n", + "A new version of the following files was downloaded from https://huggingface.co/OpenGVLab/InternVL3_5-4B-Instruct:\n", + "- configuration_internvl_chat.py\n", + "- configuration_intern_vit.py\n", + ". Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.\n", + "A new version of the following files was downloaded from https://huggingface.co/OpenGVLab/InternVL3_5-4B-Instruct:\n", + "- conversation.py\n", + ". Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.\n", + "A new version of the following files was downloaded from https://huggingface.co/OpenGVLab/InternVL3_5-4B-Instruct:\n", + "- modeling_intern_vit.py\n", + ". Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.\n", + "A new version of the following files was downloaded from https://huggingface.co/OpenGVLab/InternVL3_5-4B-Instruct:\n", + "- modeling_internvl_chat.py\n", + "- conversation.py\n", + "- modeling_intern_vit.py\n", + ". Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.\n", + "Fetching 2 files: 100%|██████████| 2/2 [00:00<00:00, 7.56it/s]\n", + "Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00, 1.03s/it]\n" + ] + } + ], + "source": [ + "import torch\n", + "from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor, AutoModel, InternVLChatModel\n", + "# from qwen_vl_utils import process_vision_info\n", + "from PIL import Image\n", + "import os\n", + "import numpy as np\n", + "from tqdm import tqdm\n", + "import math\n", + "import torch\n", + "from transformers import AutoTokenizer, AutoModel\n", + "import timm\n", + "\n", + "# --- Configuration ---\n", + "# MODEL_NAME = \"OpenGVLab/InternVL3_5-4B\" # You can choose other model sizes\n", + "MODEL_NAME = \"OpenGVLab/InternVL3_5-4B-Instruct\"\n", + "\n", + "\n", + "IMAGE_DIR = \"/home/nguyendc/model-factory/Finetuning-Automation/etc/data/media/docai_mgp_facture_v2_0/\"\n", + "BATCH_SIZE = 4\n", + "# --- End Configuration ---\n", + "\n", + "# Check for GPU availability\n", + "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", + "print(f\"Using device: {device}\")\n", + "\n", + "# Load the model and processor\n", + "\n", + "model = AutoModel.from_pretrained(\n", + " MODEL_NAME,\n", + " torch_dtype=torch.bfloat16,\n", + " use_flash_attn=True,\n", + " attn_implementation=\"flash_attention_2\",\n", + " trust_remote_code=True,\n", + " device_map=\"cuda\").eval()\n", + "\n", + "processor = AutoProcessor.from_pretrained(\n", + " MODEL_NAME, \n", + " trust_remote_code=True\n", + " )\n", + "tokenizer = AutoTokenizer.from_pretrained(\n", + " MODEL_NAME, \n", + " trust_remote_code=True, \n", + " use_fast=False\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "6d826d19", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "InternVLChatModel(\n", + " (vision_model): InternVisionModel(\n", + " (embeddings): InternVisionEmbeddings(\n", + " (patch_embedding): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))\n", + " )\n", + " (encoder): InternVisionEncoder(\n", + " (layers): ModuleList(\n", + " (0-23): 24 x InternVisionEncoderLayer(\n", + " 
(attn): InternAttention(\n", + " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", + " (attn_drop): Dropout(p=0.0, inplace=False)\n", + " (proj_drop): Dropout(p=0.0, inplace=False)\n", + " (inner_attn): FlashAttention()\n", + " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", + " )\n", + " (mlp): InternMLP(\n", + " (act): GELUActivation()\n", + " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", + " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", + " )\n", + " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", + " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", + " (drop_path1): Identity()\n", + " (drop_path2): Identity()\n", + " )\n", + " )\n", + " )\n", + " )\n", + " (language_model): Qwen3ForCausalLM(\n", + " (model): Qwen3Model(\n", + " (embed_tokens): Embedding(151936, 2560)\n", + " (layers): ModuleList(\n", + " (0-35): 36 x Qwen3DecoderLayer(\n", + " (self_attn): Qwen3Attention(\n", + " (q_proj): Linear(in_features=2560, out_features=4096, bias=False)\n", + " (k_proj): Linear(in_features=2560, out_features=1024, bias=False)\n", + " (v_proj): Linear(in_features=2560, out_features=1024, bias=False)\n", + " (o_proj): Linear(in_features=4096, out_features=2560, bias=False)\n", + " (q_norm): Qwen3RMSNorm((128,), eps=1e-06)\n", + " (k_norm): Qwen3RMSNorm((128,), eps=1e-06)\n", + " )\n", + " (mlp): Qwen3MLP(\n", + " (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)\n", + " (up_proj): Linear(in_features=2560, out_features=9728, bias=False)\n", + " (down_proj): Linear(in_features=9728, out_features=2560, bias=False)\n", + " (act_fn): SiLU()\n", + " )\n", + " (input_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)\n", + " (post_attention_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)\n", + " )\n", + " )\n", + " (norm): Qwen3RMSNorm((2560,), eps=1e-06)\n", + " (rotary_emb): Qwen3RotaryEmbedding()\n", + " )\n", + " (lm_head): Linear(in_features=2560, out_features=151936, bias=False)\n", + " )\n", + " (mlp1): Sequential(\n", + " (0): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", + " (1): Linear(in_features=4096, out_features=2560, bias=True)\n", + " (2): GELU(approximate='none')\n", + " (3): Linear(in_features=2560, out_features=2560, bias=True)\n", + " )\n", + ")" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "7bbfcf47", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "InternVisionModel(\n", + " (embeddings): InternVisionEmbeddings(\n", + " (patch_embedding): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))\n", + " )\n", + " (encoder): InternVisionEncoder(\n", + " (layers): ModuleList(\n", + " (0-23): 24 x InternVisionEncoderLayer(\n", + " (attn): InternAttention(\n", + " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", + " (attn_drop): Dropout(p=0.0, inplace=False)\n", + " (proj_drop): Dropout(p=0.0, inplace=False)\n", + " (inner_attn): FlashAttention()\n", + " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", + " )\n", + " (mlp): InternMLP(\n", + " (act): GELUActivation()\n", + " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", + " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", + " )\n", + " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", + " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", + " 
(drop_path1): Identity()\n", + " (drop_path2): Identity()\n", + " )\n", + " )\n", + " )\n", + ")" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.vision_model" + ] + }, + { + "cell_type": "markdown", + "id": "ae26d6cf", + "metadata": {}, + "source": [ + "# demo ?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "817d3ccb", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "d41f94bd", + "metadata": {}, + "outputs": [], + "source": [ + "# import math\n", + "import numpy as np\n", + "import torch\n", + "import torchvision.transforms as T\n", + "# from decord import VideoReader, cpu\n", + "from PIL import Image\n", + "from torchvision.transforms.functional import InterpolationMode\n", + "# from modelscope import AutoModel, AutoTokenizer\n", + "\n", + "IMAGENET_MEAN = (0.485, 0.456, 0.406)\n", + "IMAGENET_STD = (0.229, 0.224, 0.225)\n", + "\n", + "def build_transform(input_size):\n", + " MEAN, STD = IMAGENET_MEAN, IMAGENET_STD\n", + " transform = T.Compose([\n", + " T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),\n", + " T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),\n", + " T.ToTensor(),\n", + " T.Normalize(mean=MEAN, std=STD)\n", + " ])\n", + " return transform\n", + "\n", + "def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):\n", + " best_ratio_diff = float('inf')\n", + " best_ratio = (1, 1)\n", + " area = width * height\n", + " for ratio in target_ratios:\n", + " target_aspect_ratio = ratio[0] / ratio[1]\n", + " ratio_diff = abs(aspect_ratio - target_aspect_ratio)\n", + " if ratio_diff < best_ratio_diff:\n", + " best_ratio_diff = ratio_diff\n", + " best_ratio = ratio\n", + " elif ratio_diff == best_ratio_diff:\n", + " if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:\n", + " best_ratio = ratio\n", + " return best_ratio\n", + "\n", + "def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):\n", + " orig_width, orig_height = image.size\n", + " aspect_ratio = orig_width / orig_height\n", + "\n", + " # calculate the existing image aspect ratio\n", + " target_ratios = set(\n", + " (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if\n", + " i * j <= max_num and i * j >= min_num)\n", + " target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])\n", + "\n", + " # find the closest aspect ratio to the target\n", + " target_aspect_ratio = find_closest_aspect_ratio(\n", + " aspect_ratio, target_ratios, orig_width, orig_height, image_size)\n", + "\n", + " # calculate the target width and height\n", + " target_width = image_size * target_aspect_ratio[0]\n", + " target_height = image_size * target_aspect_ratio[1]\n", + " blocks = target_aspect_ratio[0] * target_aspect_ratio[1]\n", + "\n", + " # resize the image\n", + " resized_img = image.resize((target_width, target_height))\n", + " processed_images = []\n", + " for i in range(blocks):\n", + " box = (\n", + " (i % (target_width // image_size)) * image_size,\n", + " (i // (target_width // image_size)) * image_size,\n", + " ((i % (target_width // image_size)) + 1) * image_size,\n", + " ((i // (target_width // image_size)) + 1) * image_size\n", + " )\n", + " # split the image\n", + " split_img = resized_img.crop(box)\n", + " processed_images.append(split_img)\n", + " assert len(processed_images) == blocks\n", + " if 
use_thumbnail and len(processed_images) != 1:\n", + " thumbnail_img = image.resize((image_size, image_size))\n", + " processed_images.append(thumbnail_img)\n", + " return processed_images\n", + "\n", + "def load_image(image_file, input_size=448, max_num=12):\n", + " image = Image.open(image_file).convert('RGB')\n", + " transform = build_transform(input_size=input_size)\n", + " images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)\n", + " pixel_values = [transform(image) for image in images]\n", + " pixel_values = torch.stack(pixel_values)\n", + " return pixel_values" + ] + }, + { + "cell_type": "markdown", + "id": "f2ec71a4", + "metadata": {}, + "source": [ + "# Attention pooling\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "a404fa19", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import torch.nn.functional as F\n", + "\n", + "def gem_pool(x, p: float = 3.0, eps: float = 1e-6):\n", + " # x: [B, N, D]\n", + " return (x.clamp(min=eps).pow(p).mean(dim=1)).pow(1.0/p)\n", + "\n", + "@torch.no_grad()\n", + "def image_embedding(pixel_values, model, use_tiling=True):\n", + " # pixel_values: with the InternVL processor this can be [T,3,H,W]; if you resize yourself it is [1,3,H,W]\n", + " out = model.vision_model(pixel_values=pixel_values, output_hidden_states=True)\n", + " tok = out.last_hidden_state # [T, N, 1024] or [1, N, 1024]\n", + " if tok.dim() == 2: # in case the model returns [N, D]\n", + " tok = tok.unsqueeze(0)\n", + "\n", + " # 1) Attention pooling over tokens, within each tile\n", + " w_tok = torch.softmax(tok.norm(dim=-1), dim=1).unsqueeze(-1) # [T,N,1]\n", + " attn_tile = (tok * w_tok).sum(dim=1) # [T,1024]\n", + "\n", + " # 2) Other token-level poolings\n", + " mean_tile = tok.mean(dim=1) # [T,1024]\n", + " max_tile = tok.max(dim=1).values # [T,1024]\n", + " gem_tile = gem_pool(tok, p=3.0) # [T,1024]\n", + "\n", + " # 3) Attention across tiles (keeps the multi-scale information, but compact)\n", + " tile_scores = attn_tile.norm(dim=-1) # [T]\n", + " w_tile = torch.softmax(tile_scores, dim=0).unsqueeze(-1) # [T,1]\n", + "\n", + " mean_vec = (mean_tile * w_tile).sum(dim=0)\n", + " max_vec = (max_tile * w_tile).sum(dim=0)\n", + " gem_vec = (gem_tile * w_tile).sum(dim=0)\n", + " attn_vec = (attn_tile * w_tile).sum(dim=0)\n", + "\n", + " # 4) Fuse the different views into one information-rich vector\n", + " one_vec = torch.cat([mean_vec, max_vec, gem_vec, attn_vec], dim=0) # [4*1024]\n", + " one_vec = F.normalize(one_vec, dim=-1).unsqueeze(0) # [1, 4096]\n", + " return one_vec.half() # FP16 to save memory" + ] + },
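+ { + "cell_type": "code", + "execution_count": null, + "id": "a404fa19-usage", + "metadata": {}, + "outputs": [], + "source": [ + "# Hedged usage sketch (added for illustration, not part of the recorded run): shows how the\n", + "# image_embedding() helper above is meant to be called on the tiles produced by load_image().\n", + "# The cell id and the sample_file / px / vec names are introduced here; the file picked from\n", + "# IMAGE_DIR is arbitrary -- any invoice scan in that directory would do.\n", + "sample_file = [f for f in sorted(os.listdir(IMAGE_DIR)) if f.lower().endswith(('.png', '.jpg', '.jpeg'))][0]\n", + "px = load_image(os.path.join(IMAGE_DIR, sample_file), max_num=12).to(torch.bfloat16).cuda() # [T,3,448,448]\n", + "vec = image_embedding(px, model) # [1, 4*1024], L2-normalized, FP16\n", + "print(vec.shape)\n" + ] + },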
+ { + "cell_type": "markdown", + "id": "ed35a4ce", + "metadata": {}, + "source": [ + "# pool" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "3edf8b67", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import torch.nn.functional as F\n", + "\n", + "# --- Token-level pooling (within one tile) ---\n", + "def _pool_tokens(tokens: torch.Tensor, how: str = \"mean\") -> torch.Tensor:\n", + " \"\"\"\n", + " tokens: [1, N, D] or [N, D]\n", + " return: [D]\n", + " \"\"\"\n", + " if tokens.dim() == 3: # [1, N, D] -> [N, D]\n", + " tokens = tokens.squeeze(0)\n", + "\n", + " if how == \"mean\":\n", + " v = tokens.mean(dim=0)\n", + " elif how == \"max\":\n", + " v = tokens.max(dim=0).values\n", + " elif how == \"gem\":\n", + " p = 3.0\n", + " v = (tokens.clamp(min=1e-6).pow(p).mean(dim=0)).pow(1.0/p)\n", + " elif how == \"cls\":\n", + " # only valid if the backbone puts a CLS token in the first position\n", + " v = tokens[0]\n", + " else:\n", + " raise ValueError(f\"Unknown pooling: {how}\")\n", + "\n", + " return v\n", + "\n", + "\n", + "@torch.no_grad()\n", + "def image_embedding_global(model, pixel_values: torch.Tensor,\n", + " pool: str = \"mean\",\n", + " normalize: bool = False,\n", + " global_index: int = 0,\n", + " use_projector: bool = False) -> torch.Tensor:\n", + " \"\"\"\n", + " Returns one [1, D] vector describing the whole image, using only the GLOBAL tile.\n", + " - pixel_values: [T,3,H,W] (e.g. T=7) or [1,3,H,W]\n", + " - global_index: usually 0 (the whole-image tile comes first)\n", + " - use_projector: ONLY enable if you are sure the dimension matches the projector (mlp1)\n", + " \"\"\"\n", + " model.eval()\n", + " device = next(model.parameters()).device\n", + " x = pixel_values.to(device)\n", + "\n", + " out = model.vision_model(pixel_values=x) # last_hidden_state: [T, N, D] or [1, N, D]\n", + " tok = out.last_hidden_state\n", + "\n", + " # select the global tile\n", + " if tok.size(0) > 1:\n", + " tok = tok[global_index:global_index+1] # [1, N, D]\n", + "\n", + " # (optional) project into another space - beware of dimension mismatches!\n", + " if use_projector:\n", + " # only enable when the input dim of mlp1 is known to match tok.size(-1)\n", + " in_feat = getattr(model.mlp1[1], \"in_features\", None)\n", + " if in_feat is not None and tok.size(-1) == in_feat:\n", + " tok = model.mlp1(tok) # [1, N, D']\n", + " else:\n", + " raise ValueError(f\"Projector input dim mismatch: tokens={tok.size(-1)} vs mlp1.in={in_feat}\")\n", + "\n", + " v = _pool_tokens(tok, how=pool) # [D]\n", + " if normalize:\n", + " v = F.normalize(v, dim=-1)\n", + " return v.unsqueeze(0) # [1, D]\n", + "\n", + "\n", + "@torch.no_grad()\n", + "def image_embedding_mean(model, pixel_values: torch.Tensor,\n", + " pool: str = \"mean\",\n", + " normalize: bool = True,\n", + " use_projector: bool = False) -> torch.Tensor:\n", + " \"\"\"\n", + " Returns one [1, D] vector describing the whole image, by:\n", + " (1) pooling over tokens within each tile -> [T, D]\n", + " (2) taking the mean across tiles -> [D]\n", + " \"\"\"\n", + " model.eval()\n", + " device = next(model.parameters()).device\n", + " x = pixel_values.to(device)\n", + "\n", + " out = model.vision_model(pixel_values=x)\n", + " tok = out.last_hidden_state # [T, N, D] or [1, N, D]\n", + "\n", + " if use_projector:\n", + " in_feat = getattr(model.mlp1[1], \"in_features\", None)\n", + " if in_feat is not None and tok.size(-1) == in_feat:\n", + " tok = model.mlp1(tok)\n", + " else:\n", + " raise ValueError(f\"Projector input dim mismatch: tokens={tok.size(-1)} vs mlp1.in={in_feat}\")\n", + "\n", + " # pool over tokens within each tile\n", + " T = tok.size(0)\n", + " per_tile = [ _pool_tokens(tok[t:t+1], how=pool) for t in range(T) ] # list of [D]\n", + " per_tile = torch.stack(per_tile, dim=0) # [T, D]\n", + "\n", + " # mean across tiles\n", + " v = per_tile.mean(dim=0) # [D]\n", + " if normalize:\n", + " v = F.normalize(v, dim=-1)\n", + " return v.unsqueeze(0) # [1, D]\n" + ] + },
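+ { + "cell_type": "code", + "execution_count": null, + "id": "3edf8b67-check", + "metadata": {}, + "outputs": [], + "source": [ + "# Hedged sanity check (added for illustration, not part of the recorded run): contrasts the two\n", + "# pooling helpers defined above on a single image. The cell id and the sample_file / px names\n", + "# are introduced here; the file choice from IMAGE_DIR is arbitrary.\n", + "sample_file = [f for f in sorted(os.listdir(IMAGE_DIR)) if f.lower().endswith(('.png', '.jpg', '.jpeg'))][0]\n", + "px = load_image(os.path.join(IMAGE_DIR, sample_file), max_num=12).to(torch.bfloat16).cuda()\n", + "v_global = image_embedding_global(model, px, pool=\"mean\", normalize=True) # global tile only\n", + "v_mean = image_embedding_mean(model, px, pool=\"mean\", normalize=True) # average over all tiles\n", + "print(v_global.shape, v_mean.shape,\n", + " F.cosine_similarity(v_global.float(), v_mean.float()).item())\n" + ] + },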
+ { + "cell_type": "markdown", + "id": "613cf001", + "metadata": {}, + "source": [ + "# infer" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "cdfdab0e", + "metadata": {}, + "outputs": [], + "source": [ + "def get_image_embedding(path):\n", + " \"\"\"\n", + " Processes a single image file and extracts its embedding.\n", + " \"\"\"\n", + " images_pil = []\n", + " valid_paths = []\n", + " if path.lower().endswith(('.png', '.jpg', '.jpeg')):\n", + " try:\n", + " # The processor expects PIL images in RGB format\n", + " # images_pil.append(Image.open(path).convert(\"RGB\"))\n", + " # print(path)\n", + " valid_paths.append(path)\n", + " except Exception as e:\n", + " print(f\"Warning: Could not load image {path}. Skipping. Error: {e}\")\n", + "\n", + " if not valid_paths:\n", + " return np.array([])\n", + "\n", + " all_pixel_values = []\n", + " for valid_path in valid_paths:\n", + " pixel_values = load_image(valid_path, max_num=12).to(torch.bfloat16).cuda()\n", + " # print(pixel_values.shape)\n", + " all_pixel_values.append(pixel_values)\n", + " # For pure vision feature extraction, we can provide an empty text prompt.\n", + " # The processor handles tokenizing text and preparing images.\n", + " inputs = torch.cat(all_pixel_values, dim=0).to(device)\n", + " \n", + " # embeddings = image_embedding(inputs, model, use_tiling=True)\n", + " embeddings = image_embedding_mean(model, inputs)\n", + " \n", + " return embeddings.to(torch.float16).cpu().numpy()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "cdaebb7b", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 2800/2800 [20:51<00:00, 2.24it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Embeddings extracted and saved.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "import json\n", + "\n", + "# --- Process all images in the directory ---\n", + "image_files = [os.path.join(IMAGE_DIR, f) for f in os.listdir(IMAGE_DIR) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]\n", + "all_embeddings = []\n", + "filepaths = []\n", + "BATCH_SIZE = 1\n", + "\n", + "with open(\"embeddings_factures_osteopathie_1k_InternVL3_5-4B-Instruct.json\", \"w\") as f:\n", + "\n", + " f.write(\"[\\n\")\n", + " first = True\n", + " for i in tqdm(range(0, len(image_files), BATCH_SIZE)):\n", + " batch_paths = image_files[i:i+BATCH_SIZE]\n", + " # get_image_embedding() takes one path at a time; keep batch_paths a list so the\n", + " # zip() below pairs full file paths (not characters of a string) with embeddings\n", + " batch_embeddings = np.concatenate([get_image_embedding(p) for p in batch_paths], axis=0)\n", + " embeddings_list = [emb.tolist() for emb in batch_embeddings]\n", + " for path, emb in zip(batch_paths, embeddings_list):\n", + " if not first:\n", + " f.write(\",\\n\")\n", + " json.dump({\"filepath\": path, \"embedding\": emb}, f)\n", + " first = False\n", + " f.write(\"\\n]\\n\")\n", + "\n", + "print(\"Embeddings extracted and saved.\")" + ] + }, + { + "cell_type": "markdown", + "id": "f0d0bf0a", + "metadata": {}, + "source": [ + "# check" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "0772fc89", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loaded 2800 samples with embedding dimension 1024\n", + "Applied L2 normalization to embeddings\n", + "(2800, 1024)\n", + "(3918600,)\n", + "mean sim: 0.9939966 std: 0.0073577887\n" + ] + } + ], + "source": [ + "from sklearn.cluster import DBSCAN, MeanShift, AffinityPropagation\n", + "from sklearn.preprocessing import normalize\n", + "from sklearn.metrics import silhouette_score\n", + "from sklearn.neighbors import NearestNeighbors\n", + "from sklearn.decomposition import PCA\n", + "import argparse\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "from sklearn.metrics.pairwise import cosine_similarity\n", + "from datetime import datetime\n", + "\n", + "\n", + "embeddings_path = \"/home/nguyendc/sonnh/embedding-clustering/extract/embeddings_factures_osteopathie_1k_InternVL3_5-4B-Instruct.json\"\n", + "with open(embeddings_path, 'r') as f:\n", + " data = json.load(f)\n", + "\n", + "file_paths 
= []\n", + "embeddings_list = []\n", + "\n", + "for item in data:\n", + " file_paths.append(item['filepath'])\n", + " embeddings_list.append(item['embedding'])\n", + "\n", + "embeddings = np.array(embeddings_list, dtype=np.float32)\n", + "print(f\"Loaded {len(file_paths)} samples with embedding dimension {embeddings.shape[1]}\")\n", + "\n", + "# Normalize embeddings using L2 normalization for cosine distance\n", + "embeddings_normalized = normalize(embeddings, norm='l2', axis=1)\n", + "print(\"Applied L2 normalization to embeddings\")\n", + "\n", + "sims = cosine_similarity(embeddings)\n", + "print(embeddings.shape)\n", + "# lấy upper triangle exclude diagonal để inspect\n", + "triu_idxs = np.triu_indices_from(sims, k=1)\n", + "dist_vals = sims[triu_idxs]\n", + "print(dist_vals.shape)\n", + "print(\"mean sim:\", dist_vals.mean(), \"std:\", dist_vals.std())" + ] + }, + { + "cell_type": "markdown", + "id": "cb4ea42b", + "metadata": {}, + "source": [ + "# temp" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "2c3e6dd0", + "metadata": {}, + "outputs": [], + "source": [ + "image_path = \"/home/nguyendc/model-factory/Finetuning-Automation/etc/data/media/docai_mgp_facture_v2_0/c363e486-5d45-425e-aef9-4791cad120f7_20250213_120759_1_scale_1.0.jpg\"" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "29620d93", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.\n", + "Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "User: \n", + "Please describe the image shortly.\n", + "Assistant: The image shows a receipt for a consultation with Noël Breignaud, an osteopath. It includes his contact information, with the address \"104, cours des fossés, 33210 Langon\" and his phone number. The receipt details a payment of 55€ for a consultation dated 15/06/2020. The receipt number is 1750401922774. There are handwritten details and signatures, with the amount and date written in ink. Noël Breignaud's signature and a circular stamp are also present.\n", + "User: \n", + "Please describe the image in detail.\n", + "Assistant: The image is a handwritten receipt or invoice from a practitioner named Noël Breign aud, who is an osteopath. The text on the left side of the document includes the following details:\n", + "\n", + "- **Name:** Noël Breignaud\n", + "- **Profession:** Ouestopathe (Osteopath)\n", + "- **Address:** 104, cours des fossés, 33210 Lagnon\n", + "- **Phone Number:** Tel. 06 88 70 66 43\n", + "\n", + "On the right side, there are registration and identification numbers:\n", + "\n", + "- **Nº SIRET:** 510 123 631 00010\n", + "- **Nº ADELI:** 330001108\n", + "- **Code APE:** 8690E\n", + "\n", + "The handwritten section of the document is in French and reads:\n", + "\n", + "- \"Déclaire avoir reçu de M. M. (fils) G[obon], Acquitté la somme de 55 €\n", + "Pour 1 consultation en date du 05/04/2024\n", + "N°: 1750460-19212774\"\n", + "\n", + "At the bottom right, there is a signature that appears to be of Noël Breignaud, with a red stamp partially visible, which seems to contain the text \"Noël BREIGNAUD\" and other markings.\n", + "\n", + "The date in the handwritten section is \"05/04/2024,\" indicating the receipt or service provided on that date. 
The amount mentioned is 55 euros for one consultation.\n" + ] + } + ], + "source": [ + "pixel_values = load_image(image_path, max_num=12).to(torch.bfloat16).cuda()\n", + "generation_config = dict(max_new_tokens=1024, do_sample=True)\n", + "\n", + "\n", + "\n", + "question = '\\nPlease describe the image shortly.'\n", + "response = model.chat(tokenizer, pixel_values, question, generation_config)\n", + "print(f'User: {question}\\nAssistant: {response}')\n", + "\n", + "# single-image multi-round conversation (单图多轮对话)\n", + "question = '\\nPlease describe the image in detail.'\n", + "response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)\n", + "print(f'User: {question}\\nAssistant: {response}')" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "35dc90e0", + "metadata": {}, + "outputs": [], + "source": [ + "vout = model.vision_model(pixel_values=pixel_values, output_hidden_states=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "77f3720a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.Size([7, 1025, 1024])\n" + ] + } + ], + "source": [ + "patch_feats = vout.last_hidden_state # [B, N_patches, Dv], Dv ~ 1024 theo kiến trúc của bạn\n", + "print(patch_feats.shape)\n", + "# Nếu backbone có CLS token, bạn có thể dùng patch_feats[:,0]\n", + "# Cách an toàn chung: mean-pool\n", + "# img_vec = patch_feats.mean(dim=1) # [B, Dv]\n", + "# img_vec = torch.nn.functional.normalize(img_vec, dim=-1) # L2 normalize cho retrieval" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0043634c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "torch.Size([7, 1024])" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# img_vec.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "92032162", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/extract/clustering_example_qwen.ipynb b/extract/clustering_example_qwen.ipynb new file mode 100644 index 0000000..3974680 --- /dev/null +++ b/extract/clustering_example_qwen.ipynb @@ -0,0 +1,432 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "id": "59f8a415", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-09-02 15:00:12.976185: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n", + "WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n", + "E0000 00:00:1756825212.987686 3903757 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n", + "E0000 00:00:1756825212.991038 3903757 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n", + "W0000 
00:00:1756825213.000855 3903757 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n", + "W0000 00:00:1756825213.000880 3903757 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n", + "W0000 00:00:1756825213.000882 3903757 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n", + "W0000 00:00:1756825213.000884 3903757 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n", + "2025-09-02 15:00:13.005218: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", + "To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2025-09-02 15:00:17,970] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/bin/ld: cannot find -laio: No such file or directory\n", + "collect2: error: ld returned 1 exit status\n", + "/usr/bin/ld: cannot find -laio: No such file or directory\n", + "collect2: error: ld returned 1 exit status\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Using device: cuda\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00, 1.09it/s]\n", + "Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. 
You'll still be able to use a slow processor with `use_fast=False`.\n" + ] + } + ], + "source": [ + "import torch\n", + "from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor\n", + "# from qwen_vl_utils import process_vision_info\n", + "from PIL import Image\n", + "import os\n", + "import numpy as np\n", + "from tqdm import tqdm\n", + "\n", + "# --- Configuration ---\n", + "MODEL_NAME = \"Qwen/Qwen2.5-VL-3B-Instruct\" # You can choose other model sizes\n", + "\n", + "IMAGE_DIR = \"/home/nguyendc/model-factory/Finetuning-Automation/etc/data/media/docai_mgp_facture_v2_0/\"\n", + "BATCH_SIZE = 4\n", + "# --- End Configuration ---\n", + "\n", + "# Check for GPU availability\n", + "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", + "print(f\"Using device: {device}\")\n", + "\n", + "# Load the model and processor\n", + "model = Qwen2_5_VLForConditionalGeneration.from_pretrained(\n", + " MODEL_NAME, torch_dtype=\"bfloat16\", device_map=\"cuda\", attn_implementation=\"flash_attention_2\",\n", + ")\n", + "processor = AutoProcessor.from_pretrained(MODEL_NAME, trust_remote_code=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "13479e1a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Qwen2_5_VLProcessor:\n", + "- image_processor: Qwen2VLImageProcessor {\n", + " \"do_convert_rgb\": true,\n", + " \"do_normalize\": true,\n", + " \"do_rescale\": true,\n", + " \"do_resize\": true,\n", + " \"image_mean\": [\n", + " 0.48145466,\n", + " 0.4578275,\n", + " 0.40821073\n", + " ],\n", + " \"image_processor_type\": \"Qwen2VLImageProcessor\",\n", + " \"image_std\": [\n", + " 0.26862954,\n", + " 0.26130258,\n", + " 0.27577711\n", + " ],\n", + " \"max_pixels\": 12845056,\n", + " \"merge_size\": 2,\n", + " \"min_pixels\": 3136,\n", + " \"patch_size\": 14,\n", + " \"processor_class\": \"Qwen2_5_VLProcessor\",\n", + " \"resample\": 3,\n", + " \"rescale_factor\": 0.00392156862745098,\n", + " \"size\": {\n", + " \"longest_edge\": 12845056,\n", + " \"shortest_edge\": 3136\n", + " },\n", + " \"temporal_patch_size\": 2\n", + "}\n", + "\n", + "- tokenizer: Qwen2TokenizerFast(name_or_path='Qwen/Qwen2.5-VL-3B-Instruct', vocab_size=151643, model_max_length=131072, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={\n", + "\t151643: AddedToken(\"<|endoftext|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t151644: AddedToken(\"<|im_start|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t151645: AddedToken(\"<|im_end|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t151646: AddedToken(\"<|object_ref_start|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t151647: AddedToken(\"<|object_ref_end|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t151648: AddedToken(\"<|box_start|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t151649: AddedToken(\"<|box_end|>\", rstrip=False, 
lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t151650: AddedToken(\"<|quad_start|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t151651: AddedToken(\"<|quad_end|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t151652: AddedToken(\"<|vision_start|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t151653: AddedToken(\"<|vision_end|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t151654: AddedToken(\"<|vision_pad|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t151655: AddedToken(\"<|image_pad|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t151656: AddedToken(\"<|video_pad|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t151657: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", + "\t151658: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", + "\t151659: AddedToken(\"<|fim_prefix|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", + "\t151660: AddedToken(\"<|fim_middle|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", + "\t151661: AddedToken(\"<|fim_suffix|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", + "\t151662: AddedToken(\"<|fim_pad|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", + "\t151663: AddedToken(\"<|repo_name|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", + "\t151664: AddedToken(\"<|file_sep|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", + "}\n", + ")\n", + "\n", + "{\n", + " \"processor_class\": \"Qwen2_5_VLProcessor\"\n", + "}" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "processor" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cdfdab0e", + "metadata": {}, + "outputs": [], + "source": [ + "def get_image_embeddings(image_paths):\n", + " \"\"\"\n", + " Processes a batch of images and extracts their embeddings.\n", + " \"\"\"\n", + " images_pil = []\n", + " valid_paths = []\n", + " for path in image_paths:\n", + " if path.lower().endswith(('.png', '.jpg', '.jpeg')):\n", + " try:\n", + " # The processor expects PIL images in RGB format\n", + " images_pil.append(Image.open(path).convert(\"RGB\"))\n", + " valid_paths.append(path)\n", + " except Exception as e:\n", + " print(f\"Warning: Could not load image {path}. Skipping. 
Error: {e}\")\n", + "\n", + " if not images_pil:\n", + " return np.array([]), []\n", + "\n", + " # For pure vision feature extraction, we can provide an empty text prompt.\n", + " # The processor handles tokenizing text and preparing images.\n", + " inputs = processor(\n", + " text=[\"\"] * len(images_pil),\n", + " images=images_pil,\n", + " padding=True,\n", + " return_tensors=\"pt\"\n", + " ).to(device)\n", + "\n", + " with torch.no_grad():\n", + " # Get the vision embeddings from the model's vision tower\n", + " vision_outputs = model.visual(inputs['pixel_values'].to(dtype=model.dtype), grid_thw=inputs['image_grid_thw'])\n", + " # We'll use the pooled output as the embedding\n", + " embeddings = vision_outputs\n", + "\n", + " return embeddings.to(torch.float16).cpu().numpy()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cdaebb7b", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 700/700 [22:12<00:00, 1.90s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Embeddings extracted and saved.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "import json\n", + "\n", + "# --- Process all images in the directory ---\n", + "image_files = [os.path.join(IMAGE_DIR, f) for f in os.listdir(IMAGE_DIR) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]\n", + "all_embeddings = []\n", + "filepaths = []\n", + "\n", + "with open(\"embeddings_factures_osteopathie_1k_qwen.json\", \"w\") as f:\n", + "\n", + " f.write(\"[\\n\")\n", + " first = True\n", + " for i in tqdm(range(0, len(image_files), BATCH_SIZE)):\n", + " batch_paths = image_files[i:i+BATCH_SIZE]\n", + " batch_embeddings = get_image_embeddings(batch_paths)\n", + " embeddings_list = [emb.tolist() for emb in batch_embeddings]\n", + " for path, emb in zip(batch_paths, embeddings_list):\n", + " if not first:\n", + " f.write(\",\\n\")\n", + " json.dump({\"filepath\": path, \"embedding\": emb}, f)\n", + " first = False\n", + " f.write(\"\\n]\\n\")\n", + "\n", + "print(\"Embeddings extracted and saved.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "2c3e6dd0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loaded 2800 samples with embedding dimension 2048\n", + "Applied L2 normalization to embeddings\n", + "(2800, 2048)\n", + "(3918600,)\n", + "mean sim: 0.37961555 std: 0.22605234\n" + ] + } + ], + "source": [ + "from sklearn.cluster import DBSCAN, MeanShift, AffinityPropagation\n", + "from sklearn.preprocessing import normalize\n", + "from sklearn.metrics import silhouette_score\n", + "from sklearn.neighbors import NearestNeighbors\n", + "from sklearn.decomposition import PCA\n", + "import argparse\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "from sklearn.metrics.pairwise import cosine_similarity\n", + "from datetime import datetime\n", + "import json\n", + "\n", + "embeddings_path = \"/home/nguyendc/sonnh/embedding-clustering/extract/embeddings_factures_osteopathie_1k_qwen.json\"\n", + "with open(embeddings_path, 'r') as f:\n", + " data = json.load(f)\n", + "\n", + "file_paths = []\n", + "embeddings_list = []\n", + "\n", + "for item in data:\n", + " file_paths.append(item['filepath'])\n", + " embeddings_list.append(item['embedding'])\n", + "\n", + "embeddings = np.array(embeddings_list, dtype=np.float32)\n", + "print(f\"Loaded {len(file_paths)} samples with embedding 
dimension {embeddings.shape[1]}\")\n", + "\n", + "# Normalize embeddings using L2 normalization for cosine distance\n", + "embeddings_normalized = normalize(embeddings, norm='l2', axis=1)\n", + "print(\"Applied L2 normalization to embeddings\")\n", + "\n", + "sims = cosine_similarity(embeddings)\n", + "print(embeddings.shape)\n", + "# take the upper triangle, excluding the diagonal, to inspect the pairwise similarities\n", + "triu_idxs = np.triu_indices_from(sims, k=1)\n", + "dist_vals = sims[triu_idxs]\n", + "print(dist_vals.shape)\n", + "print(\"mean sim:\", dist_vals.mean(), \"std:\", dist_vals.std())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "29620d93", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "27fea4f3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 100% |███████████████| 1091/1091 [174.1ms elapsed, 0s remaining, 6.3K samples/s] \n" + ] + } + ], + "source": [ + "import fiftyone as fo\n", + "import fiftyone.brain as fob\n", + "import numpy as np\n", + "from sklearn.mixture import GaussianMixture\n", + "import json\n", + "\n", + "DATASET_NAME = \"mock\"\n", + "\n", + "json_path = \"./embeddings_factures_osteopathie_1k_qwen.json\"\n", + "\n", + "with open(json_path, \"r\") as file:\n", + " embedding_data = json.load(file)\n", + "\n", + "file_paths = []\n", + "embeddings = []\n", + "for i, record in enumerate(embedding_data):\n", + " file_paths.append(record.get(\"filepath\"))\n", + " embeddings.append(record.get(\"embedding\"))\n", + "\n", + "if DATASET_NAME in fo.list_datasets():\n", + " dataset = fo.load_dataset(DATASET_NAME)\n", + " dataset.delete()\n", + "dataset = fo.Dataset(DATASET_NAME)\n", + "\n", + "# Add samples to the dataset\n", + "samples = [fo.Sample(filepath=p) for p in file_paths]\n", + "dataset.add_samples(samples)\n", + "\n", + "# Fit a Gaussian mixture model (GMM) with 50 components\n", + "n_gaussians = 50\n", + "gmm = GaussianMixture(n_components=n_gaussians, random_state=42)\n", + "gmm.fit(embeddings)\n", + "cluster_labels = gmm.predict(embeddings)\n", + "\n", + "# Add the cluster labels to the dataset for visualization\n", + "dataset.add_sample_field(\"gmm_cluster_50_gaussians\", fo.IntField)\n", + "for sample, label in zip(dataset, cluster_labels):\n", + " sample[\"gmm_cluster_50_gaussians\"] = int(label)\n", + " sample.save()\n", + "\n", + "n_gaussians = 200\n", + "gmm = GaussianMixture(n_components=n_gaussians, random_state=42)\n", + "gmm.fit(embeddings)\n", + "cluster_labels = gmm.predict(embeddings)\n", + "\n", + "# Add the cluster labels to the dataset for visualization\n", + "dataset.add_sample_field(\"gmm_cluster_200_gaussians\", fo.IntField)\n", + "for sample, label in zip(dataset, cluster_labels):\n", + " sample[\"gmm_cluster_200_gaussians\"] = int(label)\n", + " sample.save()\n", + "\n", + "# --- Visualize the embeddings in 2D (t-SNE via fiftyone.brain) ---\n", + "# This computes a 2D representation of the embeddings for visualization.\n", + "res = fob.compute_visualization(\n", + " dataset,\n", + " embeddings=embeddings,\n", + " brain_key=\"qwen_vision_viz\",\n", + " method=\"tsne\",\n", + " verbose=True\n", + ")\n", + "dataset.set_values(\"qwen_umap\", res.current_points)\n", + "\n", + "print(\"t-SNE visualization computed. 
Launch the app to see the plot.\")\n", + "session = fo.launch_app(dataset)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/extract/clustering_example_qwen_Debug.ipynb b/extract/clustering_example_qwen_Debug.ipynb new file mode 100644 index 0000000..6f1421f --- /dev/null +++ b/extract/clustering_example_qwen_Debug.ipynb @@ -0,0 +1,1301 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 10, + "id": "59f8a415", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Using device: cuda\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00, 1.26it/s]\n" + ] + } + ], + "source": [ + "import torch\n", + "from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor, AutoModel\n", + "# from qwen_vl_utils import process_vision_info\n", + "from PIL import Image\n", + "import os\n", + "import numpy as np\n", + "from tqdm import tqdm\n", + "import math\n", + "import torch\n", + "from transformers import AutoTokenizer, AutoModel\n", + "import timm\n", + "\n", + "# --- Configuration ---\n", + "MODEL_NAME = \"Qwen/Qwen2.5-VL-3B-Instruct\" # You can choose other model sizes\n", + "IMAGE_DIR = \"/home/nguyendc/model-factory/Finetuning-Automation/etc/data/media/docai_mgp_facture_v2_0/\"\n", + "BATCH_SIZE = 4\n", + "# --- End Configuration ---\n", + "\n", + "# Check for GPU availability\n", + "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", + "print(f\"Using device: {device}\")\n", + "\n", + "# Load the model and processor\n", + "\n", + "model = Qwen2_5_VLForConditionalGeneration.from_pretrained(\n", + " MODEL_NAME,\n", + " torch_dtype=torch.bfloat16,\n", + " # use_flash_attn=True,\n", + " attn_implementation=\"flash_attention_2\",\n", + " trust_remote_code=True,\n", + " device_map=\"cuda\").eval()\n", + "\n", + "processor = AutoProcessor.from_pretrained(\n", + " MODEL_NAME, \n", + " trust_remote_code=True\n", + " )\n", + "tokenizer = AutoTokenizer.from_pretrained(\n", + " MODEL_NAME, \n", + " trust_remote_code=True, \n", + " use_fast=False\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "69e7b24e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Qwen2_5_VLForConditionalGeneration(\n", + " (visual): Qwen2_5_VisionTransformerPretrainedModel(\n", + " (patch_embed): Qwen2_5_VisionPatchEmbed(\n", + " (proj): Conv3d(3, 1280, kernel_size=(2, 14, 14), stride=(2, 14, 14), bias=False)\n", + " )\n", + " (rotary_pos_emb): Qwen2_5_VisionRotaryEmbedding()\n", + " (blocks): ModuleList(\n", + " (0-31): 32 x Qwen2_5_VLVisionBlock(\n", + " (norm1): Qwen2RMSNorm((1280,), eps=1e-06)\n", + " (norm2): Qwen2RMSNorm((1280,), eps=1e-06)\n", + " (attn): Qwen2_5_VLVisionFlashAttention2(\n", + " (qkv): Linear(in_features=1280, out_features=3840, bias=True)\n", + " (proj): Linear(in_features=1280, out_features=1280, bias=True)\n", + " )\n", + " (mlp): Qwen2_5_VLMLP(\n", + " (gate_proj): Linear(in_features=1280, out_features=3420, bias=True)\n", + " (up_proj): Linear(in_features=1280, out_features=3420, bias=True)\n", + " 
(down_proj): Linear(in_features=3420, out_features=1280, bias=True)\n", + " (act_fn): SiLU()\n", + " )\n", + " )\n", + " )\n", + " (merger): Qwen2_5_VLPatchMerger(\n", + " (ln_q): Qwen2RMSNorm((1280,), eps=1e-06)\n", + " (mlp): Sequential(\n", + " (0): Linear(in_features=5120, out_features=5120, bias=True)\n", + " (1): GELU(approximate='none')\n", + " (2): Linear(in_features=5120, out_features=2048, bias=True)\n", + " )\n", + " )\n", + " )\n", + " (model): Qwen2_5_VLModel(\n", + " (embed_tokens): Embedding(151936, 2048)\n", + " (layers): ModuleList(\n", + " (0-35): 36 x Qwen2_5_VLDecoderLayer(\n", + " (self_attn): Qwen2_5_VLFlashAttention2(\n", + " (q_proj): Linear(in_features=2048, out_features=2048, bias=True)\n", + " (k_proj): Linear(in_features=2048, out_features=256, bias=True)\n", + " (v_proj): Linear(in_features=2048, out_features=256, bias=True)\n", + " (o_proj): Linear(in_features=2048, out_features=2048, bias=False)\n", + " (rotary_emb): Qwen2_5_VLRotaryEmbedding()\n", + " )\n", + " (mlp): Qwen2MLP(\n", + " (gate_proj): Linear(in_features=2048, out_features=11008, bias=False)\n", + " (up_proj): Linear(in_features=2048, out_features=11008, bias=False)\n", + " (down_proj): Linear(in_features=11008, out_features=2048, bias=False)\n", + " (act_fn): SiLU()\n", + " )\n", + " (input_layernorm): Qwen2RMSNorm((2048,), eps=1e-06)\n", + " (post_attention_layernorm): Qwen2RMSNorm((2048,), eps=1e-06)\n", + " )\n", + " )\n", + " (norm): Qwen2RMSNorm((2048,), eps=1e-06)\n", + " (rotary_emb): Qwen2_5_VLRotaryEmbedding()\n", + " )\n", + " (lm_head): Linear(in_features=2048, out_features=151936, bias=False)\n", + ")" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "0865539d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Qwen2_5_VLProcessor:\n", + "- image_processor: Qwen2VLImageProcessor {\n", + " \"do_convert_rgb\": true,\n", + " \"do_normalize\": true,\n", + " \"do_rescale\": true,\n", + " \"do_resize\": true,\n", + " \"image_mean\": [\n", + " 0.48145466,\n", + " 0.4578275,\n", + " 0.40821073\n", + " ],\n", + " \"image_processor_type\": \"Qwen2VLImageProcessor\",\n", + " \"image_std\": [\n", + " 0.26862954,\n", + " 0.26130258,\n", + " 0.27577711\n", + " ],\n", + " \"max_pixels\": 12845056,\n", + " \"merge_size\": 2,\n", + " \"min_pixels\": 3136,\n", + " \"patch_size\": 14,\n", + " \"processor_class\": \"Qwen2_5_VLProcessor\",\n", + " \"resample\": 3,\n", + " \"rescale_factor\": 0.00392156862745098,\n", + " \"size\": {\n", + " \"longest_edge\": 12845056,\n", + " \"shortest_edge\": 3136\n", + " },\n", + " \"temporal_patch_size\": 2\n", + "}\n", + "\n", + "- tokenizer: Qwen2TokenizerFast(name_or_path='Qwen/Qwen2.5-VL-3B-Instruct', vocab_size=151643, model_max_length=131072, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={\n", + "\t151643: AddedToken(\"<|endoftext|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t151644: AddedToken(\"<|im_start|>\", rstrip=False, lstrip=False, single_word=False, 
normalized=False, special=True),\n", + "\t151645: AddedToken(\"<|im_end|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t151646: AddedToken(\"<|object_ref_start|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t151647: AddedToken(\"<|object_ref_end|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t151648: AddedToken(\"<|box_start|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t151649: AddedToken(\"<|box_end|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t151650: AddedToken(\"<|quad_start|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t151651: AddedToken(\"<|quad_end|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t151652: AddedToken(\"<|vision_start|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t151653: AddedToken(\"<|vision_end|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t151654: AddedToken(\"<|vision_pad|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t151655: AddedToken(\"<|image_pad|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t151656: AddedToken(\"<|video_pad|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t151657: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", + "\t151658: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", + "\t151659: AddedToken(\"<|fim_prefix|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", + "\t151660: AddedToken(\"<|fim_middle|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", + "\t151661: AddedToken(\"<|fim_suffix|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", + "\t151662: AddedToken(\"<|fim_pad|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", + "\t151663: AddedToken(\"<|repo_name|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", + "\t151664: AddedToken(\"<|file_sep|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", + "}\n", + ")\n", + "\n", + "{\n", + " \"processor_class\": \"Qwen2_5_VLProcessor\"\n", + "}" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "processor" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "abf32acc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Qwen2_5_VisionTransformerPretrainedModel(\n", + " (patch_embed): Qwen2_5_VisionPatchEmbed(\n", + " (proj): Conv3d(3, 1280, kernel_size=(2, 14, 14), stride=(2, 14, 14), bias=False)\n", + " )\n", + " (rotary_pos_emb): Qwen2_5_VisionRotaryEmbedding()\n", + " (blocks): ModuleList(\n", + " (0-31): 32 x Qwen2_5_VLVisionBlock(\n", + " (norm1): Qwen2RMSNorm((1280,), eps=1e-06)\n", + " (norm2): Qwen2RMSNorm((1280,), eps=1e-06)\n", + " (attn): Qwen2_5_VLVisionFlashAttention2(\n", + " (qkv): Linear(in_features=1280, out_features=3840, bias=True)\n", + " (proj): Linear(in_features=1280, out_features=1280, bias=True)\n", 
+ " )\n", + " (mlp): Qwen2_5_VLMLP(\n", + " (gate_proj): Linear(in_features=1280, out_features=3420, bias=True)\n", + " (up_proj): Linear(in_features=1280, out_features=3420, bias=True)\n", + " (down_proj): Linear(in_features=3420, out_features=1280, bias=True)\n", + " (act_fn): SiLU()\n", + " )\n", + " )\n", + " )\n", + " (merger): Qwen2_5_VLPatchMerger(\n", + " (ln_q): Qwen2RMSNorm((1280,), eps=1e-06)\n", + " (mlp): Sequential(\n", + " (0): Linear(in_features=5120, out_features=5120, bias=True)\n", + " (1): GELU(approximate='none')\n", + " (2): Linear(in_features=5120, out_features=2048, bias=True)\n", + " )\n", + " )\n", + ")" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.visual" + ] + }, + { + "cell_type": "markdown", + "id": "8037ae45", + "metadata": {}, + "source": [ + "# preprocess" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d41f94bd", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "f2ec71a4", + "metadata": {}, + "source": [ + "# Attention pooling\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "a404fa19", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import torch.nn.functional as F\n", + "\n", + "def gem_pool(x, p: float = 3.0, eps: float = 1e-6):\n", + " # x: [B, N, D]\n", + " return (x.clamp(min=eps).pow(p).mean(dim=1)).pow(1.0/p)\n", + "\n", + "@torch.no_grad()\n", + "def image_embedding(pixel_values, model, use_tiling=True):\n", + " # pixel_values: nếu dùng processor của InternVL, có thể là [T,3,H,W]; nếu bạn tự resize = [1,3,H,W]\n", + " out = model.vision_model(pixel_values=pixel_values, output_hidden_states=True)\n", + " tok = out.last_hidden_state # [T, N, 1024] hoặc [1, N, 1024]\n", + " if tok.dim() == 2: # phòng trường hợp model trả [N, D]\n", + " tok = tok.unsqueeze(0)\n", + "\n", + " # 1) Attention pooling theo token, trong từng tile\n", + " w_tok = torch.softmax(tok.norm(dim=-1), dim=1).unsqueeze(-1) # [T,N,1]\n", + " attn_tile = (tok * w_tok).sum(dim=1) # [T,1024]\n", + "\n", + " # 2) Các pooling khác theo token\n", + " mean_tile = tok.mean(dim=1) # [T,1024]\n", + " max_tile = tok.max(dim=1).values # [T,1024]\n", + " gem_tile = gem_pool(tok, p=3.0) # [T,1024]\n", + "\n", + " # 3) Attention across-tiles (giữ multi-scale nhưng gọn)\n", + " tile_scores = attn_tile.norm(dim=-1) # [T]\n", + " w_tile = torch.softmax(tile_scores, dim=0).unsqueeze(-1) # [T,1]\n", + "\n", + " mean_vec = (mean_tile * w_tile).sum(dim=0)\n", + " max_vec = (max_tile * w_tile).sum(dim=0)\n", + " gem_vec = (gem_tile * w_tile).sum(dim=0)\n", + " attn_vec = (attn_tile * w_tile).sum(dim=0)\n", + "\n", + " # 4) Hợp nhất nhiều “góc nhìn” → 1 vector giàu thông tin\n", + " one_vec = torch.cat([mean_vec, max_vec, gem_vec, attn_vec], dim=0) # [4*1024]\n", + " one_vec = F.normalize(one_vec, dim=-1).unsqueeze(0) # [1, 4096]\n", + " return one_vec.half() # FP16 để tiết kiệm bộ nhớ" + ] + }, + { + "cell_type": "markdown", + "id": "ed35a4ce", + "metadata": {}, + "source": [ + "# pool" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "3edf8b67", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import torch.nn.functional as F\n", + "\n", + "# --- Pooling theo token (trong 1 tile) ---\n", + "def _pool_tokens(tokens: torch.Tensor, how: str = \"mean\") -> torch.Tensor:\n", + " \"\"\"\n", + " tokens: [1, N, D] hoặc [N, D]\n", + " return: [D]\n", + " \"\"\"\n", + " if tokens.dim() == 
+  {
+   "cell_type": "markdown",
+   "id": "f2ec71a4",
+   "metadata": {},
+   "source": [
+    "# Attention pooling\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "a404fa19",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "import torch.nn.functional as F\n",
+    "\n",
+    "def gem_pool(x, p: float = 3.0, eps: float = 1e-6):\n",
+    "    # x: [B, N, D]\n",
+    "    return (x.clamp(min=eps).pow(p).mean(dim=1)).pow(1.0/p)\n",
+    "\n",
+    "@torch.no_grad()\n",
+    "def image_embedding(pixel_values, model, use_tiling=True):\n",
+    "    # pixel_values: with the InternVL processor this is typically [T,3,H,W]; if you resize yourself it is [1,3,H,W]\n",
+    "    out = model.vision_model(pixel_values=pixel_values, output_hidden_states=True)\n",
+    "    tok = out.last_hidden_state  # [T, N, 1024] or [1, N, 1024]\n",
+    "    if tok.dim() == 2:  # in case the model returns [N, D]\n",
+    "        tok = tok.unsqueeze(0)\n",
+    "\n",
+    "    # 1) Attention pooling over tokens, within each tile\n",
+    "    w_tok = torch.softmax(tok.norm(dim=-1), dim=1).unsqueeze(-1)  # [T,N,1]\n",
+    "    attn_tile = (tok * w_tok).sum(dim=1)  # [T,1024]\n",
+    "\n",
+    "    # 2) Other token-level poolings\n",
+    "    mean_tile = tok.mean(dim=1)  # [T,1024]\n",
+    "    max_tile = tok.max(dim=1).values  # [T,1024]\n",
+    "    gem_tile = gem_pool(tok, p=3.0)  # [T,1024]\n",
+    "\n",
+    "    # 3) Attention across tiles (keeps the multi-scale signal but stays compact)\n",
+    "    tile_scores = attn_tile.norm(dim=-1)  # [T]\n",
+    "    w_tile = torch.softmax(tile_scores, dim=0).unsqueeze(-1)  # [T,1]\n",
+    "\n",
+    "    mean_vec = (mean_tile * w_tile).sum(dim=0)\n",
+    "    max_vec = (max_tile * w_tile).sum(dim=0)\n",
+    "    gem_vec = (gem_tile * w_tile).sum(dim=0)\n",
+    "    attn_vec = (attn_tile * w_tile).sum(dim=0)\n",
+    "\n",
+    "    # 4) Fuse the different \"views\" into one information-rich vector\n",
+    "    one_vec = torch.cat([mean_vec, max_vec, gem_vec, attn_vec], dim=0)  # [4*1024]\n",
+    "    one_vec = F.normalize(one_vec, dim=-1).unsqueeze(0)  # [1, 4096]\n",
+    "    return one_vec.half()  # FP16 to save memory"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ed35a4ce",
+   "metadata": {},
+   "source": [
+    "# pool"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "3edf8b67",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "import torch.nn.functional as F\n",
+    "\n",
+    "# --- Token-level pooling (within one tile) ---\n",
+    "def _pool_tokens(tokens: torch.Tensor, how: str = \"mean\") -> torch.Tensor:\n",
+    "    \"\"\"\n",
+    "    tokens: [1, N, D] or [N, D]\n",
+    "    return: [D]\n",
+    "    \"\"\"\n",
+    "    if tokens.dim() == 3:  # [1, N, D] -> [N, D]\n",
+    "        tokens = tokens.squeeze(0)\n",
+    "\n",
+    "    if how == \"mean\":\n",
+    "        v = tokens.mean(dim=0)\n",
+    "    elif how == \"max\":\n",
+    "        v = tokens.max(dim=0).values\n",
+    "    elif how == \"gem\":\n",
+    "        p = 3.0\n",
+    "        v = (tokens.clamp(min=1e-6).pow(p).mean(dim=0)).pow(1.0/p)\n",
+    "    elif how == \"cls\":\n",
+    "        # only valid if the backbone puts a CLS token in the first position\n",
+    "        v = tokens[0]\n",
+    "    else:\n",
+    "        raise ValueError(f\"Unknown pooling: {how}\")\n",
+    "\n",
+    "    return v\n",
+    "\n",
+    "\n",
+    "@torch.no_grad()\n",
+    "def image_embedding_global(model, pixel_values: torch.Tensor,\n",
+    "                           pool: str = \"mean\",\n",
+    "                           normalize: bool = False,\n",
+    "                           global_index: int = 0,\n",
+    "                           use_projector: bool = False) -> torch.Tensor:\n",
+    "    \"\"\"\n",
+    "    Returns one [1, D] vector describing the whole image, using only the GLOBAL tile.\n",
+    "    - pixel_values: [T,3,H,W] (e.g. T=7) or [1,3,H,W]\n",
+    "    - global_index: usually 0 (the whole-image tile comes first)\n",
+    "    - use_projector: ONLY enable if you are sure the dimensions match the projector (mlp1)\n",
+    "    \"\"\"\n",
+    "    model.eval()\n",
+    "    device = next(model.parameters()).device\n",
+    "    x = pixel_values.to(device)\n",
+    "\n",
+    "    out = model.vision_model(pixel_values=x)  # last_hidden_state: [T, N, D] or [1, N, D]\n",
+    "    tok = out.last_hidden_state\n",
+    "\n",
+    "    # select the global tile\n",
+    "    if tok.size(0) > 1:\n",
+    "        tok = tok[global_index:global_index+1]  # [1, N, D]\n",
+    "\n",
+    "    # (optional) project into another space - watch out for dimension mismatches!\n",
+    "    if use_projector:\n",
+    "        # ONLY enable this when you know mlp1's input dim matches tok.size(-1)\n",
+    "        in_feat = getattr(model.mlp1[1], \"in_features\", None)\n",
+    "        if in_feat is not None and tok.size(-1) == in_feat:\n",
+    "            tok = model.mlp1(tok)  # [1, N, D']\n",
+    "        else:\n",
+    "            raise ValueError(f\"Projector input dim mismatch: tokens={tok.size(-1)} vs mlp1.in={in_feat}\")\n",
+    "\n",
+    "    v = _pool_tokens(tok, how=pool)  # [D]\n",
+    "    if normalize:\n",
+    "        v = F.normalize(v, dim=-1)\n",
+    "    return v.unsqueeze(0)  # [1, D]\n",
+    "\n",
+    "\n",
+    "@torch.no_grad()\n",
+    "def image_embedding_mean(model, pixel_values: torch.Tensor,\n",
+    "                         pool: str = \"mean\",\n",
+    "                         normalize: bool = True,\n",
+    "                         use_projector: bool = False) -> torch.Tensor:\n",
+    "    \"\"\"\n",
+    "    Returns one [1, D] vector describing the whole image by:\n",
+    "    (1) pooling tokens within each tile -> [T, D]\n",
+    "    (2) taking the mean across tiles -> [D]\n",
+    "    \"\"\"\n",
+    "    model.eval()\n",
+    "    device = next(model.parameters()).device\n",
+    "    x = pixel_values.to(device)\n",
+    "\n",
+    "    out = model.vision_model(pixel_values=x)\n",
+    "    tok = out.last_hidden_state  # [T, N, D] or [1, N, D]\n",
+    "\n",
+    "    if use_projector:\n",
+    "        in_feat = getattr(model.mlp1[1], \"in_features\", None)\n",
+    "        if in_feat is not None and tok.size(-1) == in_feat:\n",
+    "            tok = model.mlp1(tok)\n",
+    "        else:\n",
+    "            raise ValueError(f\"Projector input dim mismatch: tokens={tok.size(-1)} vs mlp1.in={in_feat}\")\n",
+    "\n",
+    "    # pool tokens within each tile\n",
+    "    T = tok.size(0)\n",
+    "    per_tile = [ _pool_tokens(tok[t:t+1], how=pool) for t in range(T) ]  # list of [D]\n",
+    "    per_tile = torch.stack(per_tile, dim=0)  # [T, D]\n",
+    "\n",
+    "    # mean across tiles\n",
+    "    v = per_tile.mean(dim=0)  # [D]\n",
+    "    if normalize:\n",
+    "        v = F.normalize(v, dim=-1)\n",
+    "    return v.unsqueeze(0)  # [1, D]\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "613cf001",
+   "metadata": {},
+   "source": [
+    "# infer"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
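Before the inference cell that follows, here is a small usage sketch of the pooling helpers above. It assumes the hypothetical `load_image` sketch from the preprocess section and an InternVL checkpoint whose `vision_model` attribute exists; the traceback a few cells below shows the failure mode when the same pooling call is made against a `Qwen2_5_VLForConditionalGeneration`, which exposes `visual` rather than `vision_model`.

```python
# Hypothetical usage on a single image (the path is an assumption).
image_path = "/path/to/invoice.jpg"
pixel_values = load_image(image_path, max_num=12).to(torch.bfloat16).cuda()   # [T, 3, 448, 448]
vec = image_embedding_mean(model, pixel_values, pool="mean", normalize=True)  # [1, 1024], L2-normalized
vec = vec.to(torch.float16).cpu().numpy()
```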
+ "id": "cdfdab0e", + "metadata": {}, + "outputs": [], + "source": [ + "def get_image_embedding(path):\n", + " \"\"\"\n", + " Processes a batch of images and extracts their embeddings.\n", + " \"\"\"\n", + " images_pil = []\n", + " valid_paths = []\n", + " if path.lower().endswith(('.png', '.jpg', '.jpeg')):\n", + " try:\n", + " # The processor expects PIL images in RGB format\n", + " images_pil.append(Image.open(path).convert(\"RGB\"))\n", + " # print(path)\n", + " valid_paths.append(path)\n", + " except Exception as e:\n", + " print(f\"Warning: Could not load image {path}. Skipping. Error: {e}\")\n", + "\n", + " if not valid_paths:\n", + " return np.array([]), []\n", + "\n", + " inputs = processor(\n", + " text=[\"\"] * len(images_pil),\n", + " images=images_pil,\n", + " padding=True,\n", + " return_tensors=\"pt\"\n", + " ).to(device)\n", + " \n", + " # embeddings = image_embedding(inputs, model, use_tiling=True)\n", + " embeddings = image_embedding_mean(model, inputs)\n", + " \n", + " return embeddings.to(torch.float16).cpu().numpy()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "cdaebb7b", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 0/2800 [00:03 15\u001b[0m batch_embeddings \u001b[38;5;241m=\u001b[39m \u001b[43mget_image_embedding\u001b[49m\u001b[43m(\u001b[49m\u001b[43mbatch_paths\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 16\u001b[0m embeddings_list \u001b[38;5;241m=\u001b[39m [emb\u001b[38;5;241m.\u001b[39mtolist() \u001b[38;5;28;01mfor\u001b[39;00m emb \u001b[38;5;129;01min\u001b[39;00m batch_embeddings]\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m path, emb \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mzip\u001b[39m(batch_paths, embeddings_list):\n", + "Cell \u001b[0;32mIn[14], line 27\u001b[0m, in \u001b[0;36mget_image_embedding\u001b[0;34m(path)\u001b[0m\n\u001b[1;32m 19\u001b[0m inputs \u001b[38;5;241m=\u001b[39m processor(\n\u001b[1;32m 20\u001b[0m text\u001b[38;5;241m=\u001b[39m[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m*\u001b[39m \u001b[38;5;28mlen\u001b[39m(images_pil),\n\u001b[1;32m 21\u001b[0m images\u001b[38;5;241m=\u001b[39mimages_pil,\n\u001b[1;32m 22\u001b[0m padding\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m,\n\u001b[1;32m 23\u001b[0m return_tensors\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpt\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 24\u001b[0m )\u001b[38;5;241m.\u001b[39mto(device)\n\u001b[1;32m 26\u001b[0m \u001b[38;5;66;03m# embeddings = image_embedding(inputs, model, use_tiling=True)\u001b[39;00m\n\u001b[0;32m---> 27\u001b[0m embeddings \u001b[38;5;241m=\u001b[39m \u001b[43mimage_embedding_mean\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 29\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m embeddings\u001b[38;5;241m.\u001b[39mto(torch\u001b[38;5;241m.\u001b[39mfloat16)\u001b[38;5;241m.\u001b[39mcpu()\u001b[38;5;241m.\u001b[39mnumpy()\n", + "File \u001b[0;32m~/sonnh/venv/lib/python3.10/site-packages/torch/utils/_contextlib.py:116\u001b[0m, in \u001b[0;36mcontext_decorator..decorate_context\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 113\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(func)\n\u001b[1;32m 114\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m 
\u001b[39m\u001b[38;5;21mdecorate_context\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 115\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m ctx_factory():\n\u001b[0;32m--> 116\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "Cell \u001b[0;32mIn[13], line 81\u001b[0m, in \u001b[0;36mimage_embedding_mean\u001b[0;34m(model, pixel_values, pool, normalize, use_projector)\u001b[0m\n\u001b[1;32m 78\u001b[0m device \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mnext\u001b[39m(model\u001b[38;5;241m.\u001b[39mparameters())\u001b[38;5;241m.\u001b[39mdevice\n\u001b[1;32m 79\u001b[0m x \u001b[38;5;241m=\u001b[39m pixel_values\u001b[38;5;241m.\u001b[39mto(device)\n\u001b[0;32m---> 81\u001b[0m out \u001b[38;5;241m=\u001b[39m \u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvision_model\u001b[49m(pixel_values\u001b[38;5;241m=\u001b[39mx)\n\u001b[1;32m 82\u001b[0m tok \u001b[38;5;241m=\u001b[39m out\u001b[38;5;241m.\u001b[39mlast_hidden_state \u001b[38;5;66;03m# [T, N, D] hoặc [1, N, D]\u001b[39;00m\n\u001b[1;32m 84\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m use_projector:\n", + "File \u001b[0;32m~/sonnh/venv/lib/python3.10/site-packages/torch/nn/modules/module.py:1940\u001b[0m, in \u001b[0;36mModule.__getattr__\u001b[0;34m(self, name)\u001b[0m\n\u001b[1;32m 1938\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m name \u001b[38;5;129;01min\u001b[39;00m modules:\n\u001b[1;32m 1939\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m modules[name]\n\u001b[0;32m-> 1940\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mAttributeError\u001b[39;00m(\n\u001b[1;32m 1941\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mtype\u001b[39m(\u001b[38;5;28mself\u001b[39m)\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m object has no attribute \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mname\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1942\u001b[0m )\n", + "\u001b[0;31mAttributeError\u001b[0m: 'Qwen2_5_VLForConditionalGeneration' object has no attribute 'vision_model'" + ] + } + ], + "source": [ + "import json\n", + "\n", + "# --- Process all images in the directory ---\n", + "image_files = [os.path.join(IMAGE_DIR, f) for f in os.listdir(IMAGE_DIR) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]\n", + "all_embeddings = []\n", + "filepaths = []\n", + "BATCH_SIZE = 1\n", + "\n", + "with open(\"embeddings_factures_osteopathie_1k_InternVL3_5-4B-Instruct.json\", \"w\") as f:\n", + "\n", + " f.write(\"[\\n\")\n", + " first = True\n", + " for i in tqdm(range(0, len(image_files), BATCH_SIZE)):\n", + " batch_paths = image_files[i]\n", + " batch_embeddings = get_image_embedding(batch_paths)\n", + " embeddings_list = [emb.tolist() for emb in batch_embeddings]\n", + " for path, emb in zip(batch_paths, embeddings_list):\n", + " if not first:\n", + " f.write(\",\\n\")\n", + " json.dump({\"filepath\": path, \"embedding\": emb}, f)\n", + " first = False\n", + " f.write(\"\\n]\\n\")\n", + "\n", + "print(\"Embeddings extracted and 
saved.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "2c3e6dd0", + "metadata": {}, + "outputs": [], + "source": [ + "image_path = \"/home/nguyendc/model-factory/Finetuning-Automation/etc/data/media/docai_mgp_facture_v2_0/c363e486-5d45-425e-aef9-4791cad120f7_20250213_120759_1_scale_1.0.jpg\"" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "29620d93", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.\n", + "Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "User: \n", + "Please describe the image shortly.\n", + "Assistant: The image shows a receipt for a consultation with Noël Breignaud, an osteopath. It includes his contact information, with the address \"104, cours des fossés, 33210 Langon\" and his phone number. The receipt details a payment of 55€ for a consultation dated 15/06/2020. The receipt number is 1750401922774. There are handwritten details and signatures, with the amount and date written in ink. Noël Breignaud's signature and a circular stamp are also present.\n", + "User: \n", + "Please describe the image in detail.\n", + "Assistant: The image is a handwritten receipt or invoice from a practitioner named Noël Breign aud, who is an osteopath. The text on the left side of the document includes the following details:\n", + "\n", + "- **Name:** Noël Breignaud\n", + "- **Profession:** Ouestopathe (Osteopath)\n", + "- **Address:** 104, cours des fossés, 33210 Lagnon\n", + "- **Phone Number:** Tel. 06 88 70 66 43\n", + "\n", + "On the right side, there are registration and identification numbers:\n", + "\n", + "- **Nº SIRET:** 510 123 631 00010\n", + "- **Nº ADELI:** 330001108\n", + "- **Code APE:** 8690E\n", + "\n", + "The handwritten section of the document is in French and reads:\n", + "\n", + "- \"Déclaire avoir reçu de M. M. (fils) G[obon], Acquitté la somme de 55 €\n", + "Pour 1 consultation en date du 05/04/2024\n", + "N°: 1750460-19212774\"\n", + "\n", + "At the bottom right, there is a signature that appears to be of Noël Breignaud, with a red stamp partially visible, which seems to contain the text \"Noël BREIGNAUD\" and other markings.\n", + "\n", + "The date in the handwritten section is \"05/04/2024,\" indicating the receipt or service provided on that date. 
The amount mentioned is 55 euros for one consultation.\n"
+     ]
+    }
+   ],
+   "source": [
+    "pixel_values = load_image(image_path, max_num=12).to(torch.bfloat16).cuda()\n",
+    "generation_config = dict(max_new_tokens=1024, do_sample=True)\n",
+    "\n",
+    "\n",
+    "\n",
+    "question = '<image>\\nPlease describe the image shortly.'\n",
+    "response = model.chat(tokenizer, pixel_values, question, generation_config)\n",
+    "print(f'User: {question}\\nAssistant: {response}')\n",
+    "\n",
+    "# single-image multi-round conversation\n",
+    "question = '<image>\\nPlease describe the image in detail.'\n",
+    "response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)\n",
+    "print(f'User: {question}\\nAssistant: {response}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "id": "35dc90e0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "vout = model.vision_model(pixel_values=pixel_values, output_hidden_states=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "77f3720a",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "torch.Size([7, 1025, 1024])\n"
+     ]
+    }
+   ],
+   "source": [
+    "patch_feats = vout.last_hidden_state  # [B, N_patches, Dv], Dv ~ 1024 for this architecture\n",
+    "print(patch_feats.shape)\n",
+    "# If the backbone has a CLS token, you can use patch_feats[:,0]\n",
+    "# A generally safe approach: mean-pool\n",
+    "# img_vec = patch_feats.mean(dim=1)  # [B, Dv]\n",
+    "# img_vec = torch.nn.functional.normalize(img_vec, dim=-1)  # L2 normalize for retrieval"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0043634c",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "torch.Size([7, 1024])"
+      ]
+     },
+     "execution_count": 21,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# img_vec.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "92032162",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "ede95852",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "InternVLChatModel(\n",
+       "  (vision_model): InternVisionModel(\n",
+       "    (embeddings): InternVisionEmbeddings(\n",
+       "      (patch_embedding): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))\n",
+       "    )\n",
+       "    (encoder): InternVisionEncoder(\n",
+       "      (layers): ModuleList(\n",
+       "        (0): InternVisionEncoderLayer(\n",
+       "          (attn): InternAttention(\n",
+       "            (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
+       "            (attn_drop): Dropout(p=0.0, inplace=False)\n",
+       "            (proj_drop): Dropout(p=0.0, inplace=False)\n",
+       "            (inner_attn): FlashAttention()\n",
+       "            (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+       "          )\n",
+       "          (mlp): InternMLP(\n",
+       "            (act): GELUActivation()\n",
+       "            (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
+       "            (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
+       "          )\n",
+       "          (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+       "          (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+       "          (drop_path1): Identity()\n",
+       "          (drop_path2): Identity()\n",
+       "        )\n",
+       "        (1): InternVisionEncoderLayer(\n",
+       "          (attn): InternAttention(\n",
+       "            (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
+       "            (attn_drop): Dropout(p=0.0, inplace=False)\n",
+       "            (proj_drop): Dropout(p=0.0, inplace=False)\n",
+       "            (inner_attn): FlashAttention()\n",
+       "            (proj): 
Linear(in_features=1024, out_features=1024, bias=True)\n", + " )\n", + " (mlp): InternMLP(\n", + " (act): GELUActivation()\n", + " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", + " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", + " )\n", + " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", + " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", + " (drop_path1): DropPath(drop_prob=0.004)\n", + " (drop_path2): DropPath(drop_prob=0.004)\n", + " )\n", + " (2): InternVisionEncoderLayer(\n", + " (attn): InternAttention(\n", + " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", + " (attn_drop): Dropout(p=0.0, inplace=False)\n", + " (proj_drop): Dropout(p=0.0, inplace=False)\n", + " (inner_attn): FlashAttention()\n", + " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", + " )\n", + " (mlp): InternMLP(\n", + " (act): GELUActivation()\n", + " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", + " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", + " )\n", + " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", + " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", + " (drop_path1): DropPath(drop_prob=0.009)\n", + " (drop_path2): DropPath(drop_prob=0.009)\n", + " )\n", + " (3): InternVisionEncoderLayer(\n", + " (attn): InternAttention(\n", + " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", + " (attn_drop): Dropout(p=0.0, inplace=False)\n", + " (proj_drop): Dropout(p=0.0, inplace=False)\n", + " (inner_attn): FlashAttention()\n", + " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", + " )\n", + " (mlp): InternMLP(\n", + " (act): GELUActivation()\n", + " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", + " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", + " )\n", + " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", + " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", + " (drop_path1): DropPath(drop_prob=0.013)\n", + " (drop_path2): DropPath(drop_prob=0.013)\n", + " )\n", + " (4): InternVisionEncoderLayer(\n", + " (attn): InternAttention(\n", + " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", + " (attn_drop): Dropout(p=0.0, inplace=False)\n", + " (proj_drop): Dropout(p=0.0, inplace=False)\n", + " (inner_attn): FlashAttention()\n", + " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", + " )\n", + " (mlp): InternMLP(\n", + " (act): GELUActivation()\n", + " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", + " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", + " )\n", + " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", + " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", + " (drop_path1): DropPath(drop_prob=0.017)\n", + " (drop_path2): DropPath(drop_prob=0.017)\n", + " )\n", + " (5): InternVisionEncoderLayer(\n", + " (attn): InternAttention(\n", + " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", + " (attn_drop): Dropout(p=0.0, inplace=False)\n", + " (proj_drop): Dropout(p=0.0, inplace=False)\n", + " (inner_attn): FlashAttention()\n", + " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", + " )\n", + " (mlp): InternMLP(\n", + " (act): GELUActivation()\n", + " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", + " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", 
+ " )\n", + " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", + " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", + " (drop_path1): DropPath(drop_prob=0.022)\n", + " (drop_path2): DropPath(drop_prob=0.022)\n", + " )\n", + " (6): InternVisionEncoderLayer(\n", + " (attn): InternAttention(\n", + " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", + " (attn_drop): Dropout(p=0.0, inplace=False)\n", + " (proj_drop): Dropout(p=0.0, inplace=False)\n", + " (inner_attn): FlashAttention()\n", + " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", + " )\n", + " (mlp): InternMLP(\n", + " (act): GELUActivation()\n", + " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", + " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", + " )\n", + " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", + " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", + " (drop_path1): DropPath(drop_prob=0.026)\n", + " (drop_path2): DropPath(drop_prob=0.026)\n", + " )\n", + " (7): InternVisionEncoderLayer(\n", + " (attn): InternAttention(\n", + " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", + " (attn_drop): Dropout(p=0.0, inplace=False)\n", + " (proj_drop): Dropout(p=0.0, inplace=False)\n", + " (inner_attn): FlashAttention()\n", + " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", + " )\n", + " (mlp): InternMLP(\n", + " (act): GELUActivation()\n", + " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", + " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", + " )\n", + " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", + " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", + " (drop_path1): DropPath(drop_prob=0.031)\n", + " (drop_path2): DropPath(drop_prob=0.031)\n", + " )\n", + " (8): InternVisionEncoderLayer(\n", + " (attn): InternAttention(\n", + " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", + " (attn_drop): Dropout(p=0.0, inplace=False)\n", + " (proj_drop): Dropout(p=0.0, inplace=False)\n", + " (inner_attn): FlashAttention()\n", + " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", + " )\n", + " (mlp): InternMLP(\n", + " (act): GELUActivation()\n", + " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", + " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", + " )\n", + " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", + " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", + " (drop_path1): DropPath(drop_prob=0.035)\n", + " (drop_path2): DropPath(drop_prob=0.035)\n", + " )\n", + " (9): InternVisionEncoderLayer(\n", + " (attn): InternAttention(\n", + " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", + " (attn_drop): Dropout(p=0.0, inplace=False)\n", + " (proj_drop): Dropout(p=0.0, inplace=False)\n", + " (inner_attn): FlashAttention()\n", + " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", + " )\n", + " (mlp): InternMLP(\n", + " (act): GELUActivation()\n", + " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", + " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", + " )\n", + " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", + " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", + " (drop_path1): DropPath(drop_prob=0.039)\n", + " (drop_path2): DropPath(drop_prob=0.039)\n", + " )\n", + " 
(10): InternVisionEncoderLayer(\n", + " (attn): InternAttention(\n", + " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", + " (attn_drop): Dropout(p=0.0, inplace=False)\n", + " (proj_drop): Dropout(p=0.0, inplace=False)\n", + " (inner_attn): FlashAttention()\n", + " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", + " )\n", + " (mlp): InternMLP(\n", + " (act): GELUActivation()\n", + " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", + " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", + " )\n", + " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", + " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", + " (drop_path1): DropPath(drop_prob=0.044)\n", + " (drop_path2): DropPath(drop_prob=0.044)\n", + " )\n", + " (11): InternVisionEncoderLayer(\n", + " (attn): InternAttention(\n", + " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", + " (attn_drop): Dropout(p=0.0, inplace=False)\n", + " (proj_drop): Dropout(p=0.0, inplace=False)\n", + " (inner_attn): FlashAttention()\n", + " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", + " )\n", + " (mlp): InternMLP(\n", + " (act): GELUActivation()\n", + " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", + " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", + " )\n", + " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", + " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", + " (drop_path1): DropPath(drop_prob=0.048)\n", + " (drop_path2): DropPath(drop_prob=0.048)\n", + " )\n", + " (12): InternVisionEncoderLayer(\n", + " (attn): InternAttention(\n", + " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", + " (attn_drop): Dropout(p=0.0, inplace=False)\n", + " (proj_drop): Dropout(p=0.0, inplace=False)\n", + " (inner_attn): FlashAttention()\n", + " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", + " )\n", + " (mlp): InternMLP(\n", + " (act): GELUActivation()\n", + " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", + " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", + " )\n", + " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", + " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", + " (drop_path1): DropPath(drop_prob=0.052)\n", + " (drop_path2): DropPath(drop_prob=0.052)\n", + " )\n", + " (13): InternVisionEncoderLayer(\n", + " (attn): InternAttention(\n", + " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", + " (attn_drop): Dropout(p=0.0, inplace=False)\n", + " (proj_drop): Dropout(p=0.0, inplace=False)\n", + " (inner_attn): FlashAttention()\n", + " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", + " )\n", + " (mlp): InternMLP(\n", + " (act): GELUActivation()\n", + " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", + " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", + " )\n", + " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", + " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", + " (drop_path1): DropPath(drop_prob=0.056)\n", + " (drop_path2): DropPath(drop_prob=0.056)\n", + " )\n", + " (14): InternVisionEncoderLayer(\n", + " (attn): InternAttention(\n", + " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", + " (attn_drop): Dropout(p=0.0, inplace=False)\n", + " (proj_drop): Dropout(p=0.0, inplace=False)\n", + " (inner_attn): 
FlashAttention()\n", + " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", + " )\n", + " (mlp): InternMLP(\n", + " (act): GELUActivation()\n", + " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", + " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", + " )\n", + " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", + " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", + " (drop_path1): DropPath(drop_prob=0.061)\n", + " (drop_path2): DropPath(drop_prob=0.061)\n", + " )\n", + " (15): InternVisionEncoderLayer(\n", + " (attn): InternAttention(\n", + " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", + " (attn_drop): Dropout(p=0.0, inplace=False)\n", + " (proj_drop): Dropout(p=0.0, inplace=False)\n", + " (inner_attn): FlashAttention()\n", + " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", + " )\n", + " (mlp): InternMLP(\n", + " (act): GELUActivation()\n", + " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", + " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", + " )\n", + " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", + " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", + " (drop_path1): DropPath(drop_prob=0.065)\n", + " (drop_path2): DropPath(drop_prob=0.065)\n", + " )\n", + " (16): InternVisionEncoderLayer(\n", + " (attn): InternAttention(\n", + " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", + " (attn_drop): Dropout(p=0.0, inplace=False)\n", + " (proj_drop): Dropout(p=0.0, inplace=False)\n", + " (inner_attn): FlashAttention()\n", + " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", + " )\n", + " (mlp): InternMLP(\n", + " (act): GELUActivation()\n", + " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", + " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", + " )\n", + " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", + " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", + " (drop_path1): DropPath(drop_prob=0.069)\n", + " (drop_path2): DropPath(drop_prob=0.069)\n", + " )\n", + " (17): InternVisionEncoderLayer(\n", + " (attn): InternAttention(\n", + " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", + " (attn_drop): Dropout(p=0.0, inplace=False)\n", + " (proj_drop): Dropout(p=0.0, inplace=False)\n", + " (inner_attn): FlashAttention()\n", + " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", + " )\n", + " (mlp): InternMLP(\n", + " (act): GELUActivation()\n", + " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", + " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", + " )\n", + " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", + " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", + " (drop_path1): DropPath(drop_prob=0.074)\n", + " (drop_path2): DropPath(drop_prob=0.074)\n", + " )\n", + " (18): InternVisionEncoderLayer(\n", + " (attn): InternAttention(\n", + " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", + " (attn_drop): Dropout(p=0.0, inplace=False)\n", + " (proj_drop): Dropout(p=0.0, inplace=False)\n", + " (inner_attn): FlashAttention()\n", + " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", + " )\n", + " (mlp): InternMLP(\n", + " (act): GELUActivation()\n", + " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", + " (fc2): 
Linear(in_features=4096, out_features=1024, bias=True)\n", + " )\n", + " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", + " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", + " (drop_path1): DropPath(drop_prob=0.078)\n", + " (drop_path2): DropPath(drop_prob=0.078)\n", + " )\n", + " (19): InternVisionEncoderLayer(\n", + " (attn): InternAttention(\n", + " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", + " (attn_drop): Dropout(p=0.0, inplace=False)\n", + " (proj_drop): Dropout(p=0.0, inplace=False)\n", + " (inner_attn): FlashAttention()\n", + " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", + " )\n", + " (mlp): InternMLP(\n", + " (act): GELUActivation()\n", + " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", + " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", + " )\n", + " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", + " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", + " (drop_path1): DropPath(drop_prob=0.083)\n", + " (drop_path2): DropPath(drop_prob=0.083)\n", + " )\n", + " (20): InternVisionEncoderLayer(\n", + " (attn): InternAttention(\n", + " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", + " (attn_drop): Dropout(p=0.0, inplace=False)\n", + " (proj_drop): Dropout(p=0.0, inplace=False)\n", + " (inner_attn): FlashAttention()\n", + " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", + " )\n", + " (mlp): InternMLP(\n", + " (act): GELUActivation()\n", + " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", + " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", + " )\n", + " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", + " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", + " (drop_path1): DropPath(drop_prob=0.087)\n", + " (drop_path2): DropPath(drop_prob=0.087)\n", + " )\n", + " (21): InternVisionEncoderLayer(\n", + " (attn): InternAttention(\n", + " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", + " (attn_drop): Dropout(p=0.0, inplace=False)\n", + " (proj_drop): Dropout(p=0.0, inplace=False)\n", + " (inner_attn): FlashAttention()\n", + " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", + " )\n", + " (mlp): InternMLP(\n", + " (act): GELUActivation()\n", + " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", + " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", + " )\n", + " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", + " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", + " (drop_path1): DropPath(drop_prob=0.091)\n", + " (drop_path2): DropPath(drop_prob=0.091)\n", + " )\n", + " (22): InternVisionEncoderLayer(\n", + " (attn): InternAttention(\n", + " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", + " (attn_drop): Dropout(p=0.0, inplace=False)\n", + " (proj_drop): Dropout(p=0.0, inplace=False)\n", + " (inner_attn): FlashAttention()\n", + " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", + " )\n", + " (mlp): InternMLP(\n", + " (act): GELUActivation()\n", + " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", + " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", + " )\n", + " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", + " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", + " (drop_path1): DropPath(drop_prob=0.096)\n", + 
" (drop_path2): DropPath(drop_prob=0.096)\n", + " )\n", + " (23): InternVisionEncoderLayer(\n", + " (attn): InternAttention(\n", + " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", + " (attn_drop): Dropout(p=0.0, inplace=False)\n", + " (proj_drop): Dropout(p=0.0, inplace=False)\n", + " (inner_attn): FlashAttention()\n", + " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", + " )\n", + " (mlp): InternMLP(\n", + " (act): GELUActivation()\n", + " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", + " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", + " )\n", + " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", + " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", + " (drop_path1): DropPath(drop_prob=0.100)\n", + " (drop_path2): DropPath(drop_prob=0.100)\n", + " )\n", + " )\n", + " )\n", + " )\n", + " (language_model): Qwen3ForCausalLM(\n", + " (model): Qwen3Model(\n", + " (embed_tokens): Embedding(151936, 2560)\n", + " (layers): ModuleList(\n", + " (0-35): 36 x Qwen3DecoderLayer(\n", + " (self_attn): Qwen3Attention(\n", + " (q_proj): Linear(in_features=2560, out_features=4096, bias=False)\n", + " (k_proj): Linear(in_features=2560, out_features=1024, bias=False)\n", + " (v_proj): Linear(in_features=2560, out_features=1024, bias=False)\n", + " (o_proj): Linear(in_features=4096, out_features=2560, bias=False)\n", + " (q_norm): Qwen3RMSNorm((128,), eps=1e-06)\n", + " (k_norm): Qwen3RMSNorm((128,), eps=1e-06)\n", + " )\n", + " (mlp): Qwen3MLP(\n", + " (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)\n", + " (up_proj): Linear(in_features=2560, out_features=9728, bias=False)\n", + " (down_proj): Linear(in_features=9728, out_features=2560, bias=False)\n", + " (act_fn): SiLU()\n", + " )\n", + " (input_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)\n", + " (post_attention_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)\n", + " )\n", + " )\n", + " (norm): Qwen3RMSNorm((2560,), eps=1e-06)\n", + " (rotary_emb): Qwen3RotaryEmbedding()\n", + " )\n", + " (lm_head): Linear(in_features=2560, out_features=151936, bias=False)\n", + " )\n", + " (mlp1): Sequential(\n", + " (0): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", + " (1): Linear(in_features=4096, out_features=2560, bias=True)\n", + " (2): GELU(approximate='none')\n", + " (3): Linear(in_features=2560, out_features=2560, bias=True)\n", + " )\n", + ")" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "27fea4f3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 100% |███████████████| 1091/1091 [174.1ms elapsed, 0s remaining, 6.3K samples/s] \n" + ] + } + ], + "source": [ + "import fiftyone as fo\n", + "import fiftyone.brain as fob\n", + "import numpy as np\n", + "from sklearn.mixture import GaussianMixture\n", + "import json\n", + "\n", + "DATASET_NAME = \"mock\"\n", + "\n", + "json_path = \"./embeddings_factures_osteopathie_1k_qwen.json\"\n", + "\n", + "with open(json_path, \"r\") as file:\n", + " embedding_data = json.load(file)\n", + "\n", + "file_paths = []\n", + "embeddings = []\n", + "for i, record in enumerate(embedding_data):\n", + " file_paths.append(record.get(\"filepath\"))\n", + " embeddings.append(record.get(\"embedding\"))\n", + "\n", + "if DATASET_NAME in fo.list_datasets():\n", + " dataset = fo.load_dataset(DATASET_NAME)\n", 
+ " dataset.delete()\n", + "dataset = fo.Dataset(DATASET_NAME)\n", + "\n", + "# Add samples to the dataset\n", + "samples = [fo.Sample(filepath=p) for p in file_paths]\n", + "dataset.add_samples(samples)\n", + "\n", + "# Building Gaussian mixture model (GMM)\n", + "n_gaussians = 50\n", + "gmm = GaussianMixture(n_components=n_gaussians, random_state=42)\n", + "gmm.fit(embeddings)\n", + "cluster_labels = gmm.predict(embeddings)\n", + "\n", + "# Adding labeled embeddings to visulization\n", + "dataset.add_sample_field(\"gmm_cluster\", fo.IntField)\n", + "for sample, label in zip(dataset, cluster_labels):\n", + " sample[\"gmm_cluster_50_gaussians\"] = int(label)\n", + " sample.save()\n", + "\n", + "n_gaussians = 200\n", + "gmm = GaussianMixture(n_components=n_gaussians, random_state=42)\n", + "gmm.fit(embeddings)\n", + "cluster_labels = gmm.predict(embeddings)\n", + "\n", + "# Adding labeled embeddings to visulization\n", + "dataset.add_sample_field(\"gmm_cluster\", fo.IntField)\n", + "for sample, label in zip(dataset, cluster_labels):\n", + " sample[\"gmm_cluster_200_gaussians\"] = int(label)\n", + " sample.save()\n", + "\n", + "# --- Visualize the Embeddings with UMAP ---\n", + "# This will compute a 2D representation of your embeddings\n", + "# for visualization.\n", + "res = fob.compute_visualization(\n", + " dataset,\n", + " embeddings=embeddings,\n", + " brain_key=\"qwen_vision_viz\",\n", + " method=\"tsne\",\n", + " verbose=True\n", + ")\n", + "dataset.set_values(\"qwen_umap\", res.current_points)\n", + "\n", + "print(\"UMAP visualization computed. Launch the app to see the plot.\")\n", + "session = fo.launch_app(dataset)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/extract/extract.py b/extract/extract.py new file mode 100644 index 0000000..a53904f --- /dev/null +++ b/extract/extract.py @@ -0,0 +1,90 @@ +import torch +from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor, AutoModel +# from qwen_vl_utils import process_vision_info +from PIL import Image +import os +import numpy as np +import json +from tqdm import tqdm + +from transformers import LayoutLMv3ImageProcessor, LayoutLMv3Model + + +# --- Configuration --- +MODEL_NAME = "microsoft/layoutlmv3-base" # You can choose other model sizes +IMAGE_DIR = "/home/nguyendc/model-factory/Finetuning-Automation/etc/data/media/docai_mgp_facture_v2_0/" +BATCH_SIZE = 8 +# --- End Configuration --- + +# Check for GPU availability +device = "cuda" if torch.cuda.is_available() else "cpu" +print(f"Using device: {device}") + +# Load the model and processor +# model = AutoModel.from_pretrained( +# MODEL_NAME, torch_dtype="bfloat16", device_map="cuda" # , attn_implementation="flash_attention_2", +# ) + + +model = LayoutLMv3Model.from_pretrained(MODEL_NAME, device_map="cuda") +processor = LayoutLMv3ImageProcessor.from_pretrained(MODEL_NAME, trust_remote_code=True) + + +def get_image_embeddings(image_paths): + """ + Processes a batch of images and extracts their embeddings. 
+ """ + images_pil = [] + valid_paths = [] + for path in image_paths: + if path.lower().endswith(('.png', '.jpg', '.jpeg')): + try: + # The processor expects PIL images in RGB format + images_pil.append(Image.open(path).convert("RGB")) + valid_paths.append(path) + except Exception as e: + print(f"Warning: Could not load image {path}. Skipping. Error: {e}") + + if not images_pil: + return np.array([]), [] + + # For pure vision feature extraction, we can provide an empty text prompt. + # The processor handles tokenizing text and preparing images. + # LayoutLMv3 expects 224x224 images by default + inputs = processor( + # text=[""] * len(images_pil), + images=images_pil, + # padding=True, + size = {"height" : 224, "width": 224}, + return_tensors="pt" + ).to(device) + + with torch.no_grad(): + # Get the vision embeddings from the model's vision tower + vision_outputs = model.forward(pixel_values=inputs['pixel_values'].to(dtype=model.dtype)) # , grid_thw=inputs['image_grid_thw']) + # We'll use the pooled output as the embedding + embeddings = vision_outputs[0][:,0,:] + + return embeddings.to(torch.float16).cpu().numpy() + + +# --- Process all images in the directory --- +image_files = [os.path.join(IMAGE_DIR, f) for f in os.listdir(IMAGE_DIR) if f.lower().endswith(('.png', '.jpg', '.jpeg'))] +all_embeddings = [] +filepaths = [] + +with open("embeddings_factures_ostepoathie_1k.json", "w") as f: + f.write("[\n") + first = True + for i in tqdm(range(0, len(image_files), BATCH_SIZE)): + batch_paths = image_files[i:i+BATCH_SIZE] + batch_embeddings = get_image_embeddings(batch_paths) + embeddings_list = [emb.tolist() for emb in batch_embeddings] + for path, emb in zip(batch_paths, embeddings_list): + if not first: + f.write(",\n") + json.dump({"filepath": path, "embedding": emb}, f) + first = False + f.write("\n]\n") + +print("Embeddings extracted and saved.") \ No newline at end of file diff --git a/extract/extract_donut.py b/extract/extract_donut.py new file mode 100644 index 0000000..385c650 --- /dev/null +++ b/extract/extract_donut.py @@ -0,0 +1,201 @@ +import torch +from transformers import DonutProcessor, VisionEncoderDecoderModel +from PIL import Image +import os +import numpy as np +import json +from tqdm import tqdm + + +# --- Configuration --- +MODEL_NAME = "naver-clova-ix/donut-base-finetuned-docvqa" # Donut model for document VQA +IMAGE_DIR = "/home/nguyendc/model-factory/Finetuning-Automation/etc/data/media/docai_mgp_facture_v2_0/" +BATCH_SIZE = 4 # Smaller batch size for Donut as it's memory intensive +# --- End Configuration --- + +# Check for GPU availability +device = "cuda" if torch.cuda.is_available() else "cpu" +print(f"Using device: {device}") + +# Load the Donut model and processor +print("Loading Donut model and processor...") +processor = DonutProcessor.from_pretrained(MODEL_NAME) +model = VisionEncoderDecoderModel.from_pretrained(MODEL_NAME) +model.to(device) +model.eval() + +# Set model to half precision for efficiency if using GPU +if device == "cuda": + model = model.half() + + +def get_document_embeddings(image_paths): + """ + Processes a batch of document images and extracts their embeddings using Donut. + Uses the encoder part of the VisionEncoderDecoder model to get visual representations. 
+ """ + images_pil = [] + valid_paths = [] + + for path in image_paths: + if path.lower().endswith(('.png', '.jpg', '.jpeg')): + try: + # Load and convert image to RGB + image = Image.open(path).convert("RGB") + images_pil.append(image) + valid_paths.append(path) + except Exception as e: + print(f"Warning: Could not load image {path}. Skipping. Error: {e}") + + if not images_pil: + return np.array([]), [] + + embeddings_list = [] + + # Process images one by one to avoid memory issues + for image in images_pil: + try: + # Preprocess the image + pixel_values = processor(image, return_tensors="pt").pixel_values + pixel_values = pixel_values.to(device) + + if device == "cuda": + pixel_values = pixel_values.half() + + with torch.no_grad(): + # Get encoder outputs (visual features) + encoder_outputs = model.encoder(pixel_values=pixel_values) + + # Use the last hidden state and apply global average pooling + # to get a fixed-size representation + last_hidden_state = encoder_outputs.last_hidden_state # [batch_size, seq_len, hidden_size] + + # Global average pooling across the sequence dimension + embedding = torch.mean(last_hidden_state, dim=1) # [batch_size, hidden_size] + + embeddings_list.append(embedding.squeeze().cpu().float().numpy()) + + except Exception as e: + print(f"Warning: Could not process image. Error: {e}") + # Add zero embedding for failed images to maintain consistency + embeddings_list.append(np.zeros(model.config.encoder.hidden_size)) + + return np.array(embeddings_list), valid_paths + + +def extract_document_info(image_path, question="What information is in this document?"): + """ + Extract specific information from a document using Donut's text generation capability. + This function demonstrates how to use Donut for document understanding tasks. 
+ """ + try: + image = Image.open(image_path).convert("RGB") + + # Prepare the task prompt for document VQA + task_prompt = f"{question}" + + # Process the image and prompt + inputs = processor(image, task_prompt, return_tensors="pt") + inputs = {k: v.to(device) for k, v in inputs.items()} + + if device == "cuda": + inputs["pixel_values"] = inputs["pixel_values"].half() + + with torch.no_grad(): + # Generate answer + generated_ids = model.generate( + **inputs, + max_length=512, + early_stopping=True, + pad_token_id=processor.tokenizer.pad_token_id, + eos_token_id=processor.tokenizer.eos_token_id, + use_cache=True, + num_beams=1, + bad_words_ids=[[processor.tokenizer.unk_token_id]], + return_dict_in_generate=True, + ) + + # Decode the generated answer + decoded_text = processor.batch_decode(generated_ids.sequences)[0] + # Extract the answer part + answer = decoded_text.split("")[-1].replace("", "").strip() + + return answer + + except Exception as e: + print(f"Error extracting info from {image_path}: {e}") + return "" + + +# --- Process all images in the directory --- +print("Scanning for image files...") +image_files = [os.path.join(IMAGE_DIR, f) for f in os.listdir(IMAGE_DIR) + if f.lower().endswith(('.png', '.jpg', '.jpeg'))] +print(f"Found {len(image_files)} image files") + +all_embeddings = [] +filepaths = [] + +# Extract embeddings and save to JSON +print("Extracting embeddings using Donut...") +with open("embeddings_factures_donut.json", "w") as f: + f.write("[\n") + first = True + + for i in tqdm(range(0, len(image_files), BATCH_SIZE), desc="Processing batches"): + batch_paths = image_files[i:i+BATCH_SIZE] + batch_embeddings, valid_paths = get_document_embeddings(batch_paths) + + if len(batch_embeddings) > 0: + embeddings_list = [emb.tolist() for emb in batch_embeddings] + + for path, emb in zip(valid_paths, embeddings_list): + if not first: + f.write(",\n") + + entry = { + "filepath": path, + "embedding": emb, + "model": "donut-base-finetuned-docvqa", + "embedding_size": len(emb) + } + + json.dump(entry, f) + first = False + + f.write("\n]\n") + +print("Embeddings extracted and saved to 'embeddings_factures_donut.json'") + +# Optional: Extract some sample document information +print("\nExtracting sample document information...") +sample_images = image_files[:3] # Process first 3 images as samples + +sample_info = [] +for img_path in sample_images: + print(f"Processing: {os.path.basename(img_path)}") + + # Extract different types of information + questions = [ + "What is the total amount?", + "What is the invoice number?", + "What is the date?", + "Who is the vendor?", + "What are the main items?" + ] + + info = {"filepath": img_path, "extracted_info": {}} + + for question in questions: + answer = extract_document_info(img_path, question) + info["extracted_info"][question] = answer + print(f" {question}: {answer}") + + sample_info.append(info) + +# Save sample extraction results +with open("donut_sample_extractions.json", "w") as f: + json.dump(sample_info, f, indent=2, ensure_ascii=False) + +print("Sample document information extracted and saved to 'donut_sample_extractions.json'") +print("Processing completed!") diff --git a/extract/test.ipynb b/extract/test.ipynb new file mode 100644 index 0000000..b9193e1 --- /dev/null +++ b/extract/test.ipynb @@ -0,0 +1,139 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 10, + "id": "a314a8ac", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[0. , 0. 
],\n", + " [0.26726124, 0.56694671]])" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.metrics.pairwise import cosine_similarity\n", + "import numpy as np\n", + "X = [[0, 0, 0], [1, 2, 3]]\n", + "Y = [[1, 0, 0], [1, 1, 0]]\n", + "cosine_similarity(X, Y)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "4b560c4f", + "metadata": {}, + "outputs": [], + "source": [ + "sims = cosine_similarity(X)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d8d5d17a", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "a1098a5a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(3, 3)\n", + "(array([0, 0, 1]), array([1, 2, 2]))\n", + "(3,)\n", + "mean sim: -0.3333333333333334 std: 0.47140452079103173\n" + ] + } + ], + "source": [ + "# X = np.array([\n", + "# [0, 0, 0], \n", + "# [-1, 100, -1000],\n", + "# [-1, -2, -4]\n", + "# ]\n", + "# )\n", + "\n", + "X = np.array([\n", + " [0, 0, 0], \n", + " [1,1,1],\n", + " [-1, -1, -1]\n", + " ]\n", + " )\n", + "print(X.shape)\n", + "sims = cosine_similarity(X)\n", + "\n", + "triu_idxs = np.triu_indices_from(sims, k=1)\n", + "print(triu_idxs)\n", + "dist_vals = sims[triu_idxs]\n", + "print(dist_vals.shape)\n", + "print(\"mean sim:\", dist_vals.mean(), \"std:\", dist_vals.std())" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "2dacad18", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0.])" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dist_vals" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "76d25e07", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}