Check vision extract model
874
extract/clustering_example_InternVL3_5.ipynb
Normal file
@@ -0,0 +1,874 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "59f8a415",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/home/nguyendc/sonnh/venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
||||
" from .autonotebook import tqdm as notebook_tqdm\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"2025-09-02 13:50:30.358544: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
|
||||
"WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n",
|
||||
"E0000 00:00:1756821030.369428 3858431 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
|
||||
"E0000 00:00:1756821030.372761 3858431 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
|
||||
"W0000 00:00:1756821030.382108 3858431 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
|
||||
"W0000 00:00:1756821030.382119 3858431 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
|
||||
"W0000 00:00:1756821030.382121 3858431 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
|
||||
"W0000 00:00:1756821030.382123 3858431 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
|
||||
"2025-09-02 13:50:30.385619: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
|
||||
"To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[2025-09-02 13:50:35,304] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/usr/bin/ld: cannot find -laio: No such file or directory\n",
|
||||
"collect2: error: ld returned 1 exit status\n",
|
||||
"/usr/bin/ld: cannot find -laio: No such file or directory\n",
|
||||
"collect2: error: ld returned 1 exit status\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Using device: cuda\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"A new version of the following files was downloaded from https://huggingface.co/OpenGVLab/InternVL3_5-4B-Instruct:\n",
|
||||
"- configuration_intern_vit.py\n",
|
||||
". Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.\n",
|
||||
"A new version of the following files was downloaded from https://huggingface.co/OpenGVLab/InternVL3_5-4B-Instruct:\n",
|
||||
"- configuration_internvl_chat.py\n",
|
||||
"- configuration_intern_vit.py\n",
|
||||
". Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.\n",
|
||||
"A new version of the following files was downloaded from https://huggingface.co/OpenGVLab/InternVL3_5-4B-Instruct:\n",
|
||||
"- conversation.py\n",
|
||||
". Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.\n",
|
||||
"A new version of the following files was downloaded from https://huggingface.co/OpenGVLab/InternVL3_5-4B-Instruct:\n",
|
||||
"- modeling_intern_vit.py\n",
|
||||
". Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.\n",
|
||||
"A new version of the following files was downloaded from https://huggingface.co/OpenGVLab/InternVL3_5-4B-Instruct:\n",
|
||||
"- modeling_internvl_chat.py\n",
|
||||
"- conversation.py\n",
|
||||
"- modeling_intern_vit.py\n",
|
||||
". Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.\n",
|
||||
"Fetching 2 files: 100%|██████████| 2/2 [00:00<00:00, 7.56it/s]\n",
|
||||
"Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00, 1.03s/it]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import torch\n",
|
||||
"from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor, AutoModel, InternVLChatModel\n",
|
||||
"# from qwen_vl_utils import process_vision_info\n",
|
||||
"from PIL import Image\n",
|
||||
"import os\n",
|
||||
"import numpy as np\n",
|
||||
"from tqdm import tqdm\n",
|
||||
"import math\n",
|
||||
"import torch\n",
|
||||
"from transformers import AutoTokenizer, AutoModel\n",
|
||||
"import timm\n",
|
||||
"\n",
|
||||
"# --- Configuration ---\n",
|
||||
"# MODEL_NAME = \"OpenGVLab/InternVL3_5-4B\" # You can choose other model sizes\n",
|
||||
"MODEL_NAME = \"OpenGVLab/InternVL3_5-4B-Instruct\"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"IMAGE_DIR = \"/home/nguyendc/model-factory/Finetuning-Automation/etc/data/media/docai_mgp_facture_v2_0/\"\n",
|
||||
"BATCH_SIZE = 4\n",
|
||||
"# --- End Configuration ---\n",
|
||||
"\n",
|
||||
"# Check for GPU availability\n",
|
||||
"device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
|
||||
"print(f\"Using device: {device}\")\n",
|
||||
"\n",
|
||||
"# Load the model and processor\n",
|
||||
"\n",
|
||||
"model = AutoModel.from_pretrained(\n",
|
||||
" MODEL_NAME,\n",
|
||||
" torch_dtype=torch.bfloat16,\n",
|
||||
" use_flash_attn=True,\n",
|
||||
" attn_implementation=\"flash_attention_2\",\n",
|
||||
" trust_remote_code=True,\n",
|
||||
" device_map=\"cuda\").eval()\n",
|
||||
"\n",
|
||||
"processor = AutoProcessor.from_pretrained(\n",
|
||||
" MODEL_NAME, \n",
|
||||
" trust_remote_code=True\n",
|
||||
" )\n",
|
||||
"tokenizer = AutoTokenizer.from_pretrained(\n",
|
||||
" MODEL_NAME, \n",
|
||||
" trust_remote_code=True, \n",
|
||||
" use_fast=False\n",
|
||||
" )"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "6d826d19",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"InternVLChatModel(\n",
|
||||
" (vision_model): InternVisionModel(\n",
|
||||
" (embeddings): InternVisionEmbeddings(\n",
|
||||
" (patch_embedding): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))\n",
|
||||
" )\n",
|
||||
" (encoder): InternVisionEncoder(\n",
|
||||
" (layers): ModuleList(\n",
|
||||
" (0-23): 24 x InternVisionEncoderLayer(\n",
|
||||
" (attn): InternAttention(\n",
|
||||
" (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
|
||||
" (attn_drop): Dropout(p=0.0, inplace=False)\n",
|
||||
" (proj_drop): Dropout(p=0.0, inplace=False)\n",
|
||||
" (inner_attn): FlashAttention()\n",
|
||||
" (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
|
||||
" )\n",
|
||||
" (mlp): InternMLP(\n",
|
||||
" (act): GELUActivation()\n",
|
||||
" (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
|
||||
" (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
|
||||
" )\n",
|
||||
" (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
|
||||
" (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
|
||||
" (drop_path1): Identity()\n",
|
||||
" (drop_path2): Identity()\n",
|
||||
" )\n",
|
||||
" )\n",
|
||||
" )\n",
|
||||
" )\n",
|
||||
" (language_model): Qwen3ForCausalLM(\n",
|
||||
" (model): Qwen3Model(\n",
|
||||
" (embed_tokens): Embedding(151936, 2560)\n",
|
||||
" (layers): ModuleList(\n",
|
||||
" (0-35): 36 x Qwen3DecoderLayer(\n",
|
||||
" (self_attn): Qwen3Attention(\n",
|
||||
" (q_proj): Linear(in_features=2560, out_features=4096, bias=False)\n",
|
||||
" (k_proj): Linear(in_features=2560, out_features=1024, bias=False)\n",
|
||||
" (v_proj): Linear(in_features=2560, out_features=1024, bias=False)\n",
|
||||
" (o_proj): Linear(in_features=4096, out_features=2560, bias=False)\n",
|
||||
" (q_norm): Qwen3RMSNorm((128,), eps=1e-06)\n",
|
||||
" (k_norm): Qwen3RMSNorm((128,), eps=1e-06)\n",
|
||||
" )\n",
|
||||
" (mlp): Qwen3MLP(\n",
|
||||
" (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)\n",
|
||||
" (up_proj): Linear(in_features=2560, out_features=9728, bias=False)\n",
|
||||
" (down_proj): Linear(in_features=9728, out_features=2560, bias=False)\n",
|
||||
" (act_fn): SiLU()\n",
|
||||
" )\n",
|
||||
" (input_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)\n",
|
||||
" (post_attention_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)\n",
|
||||
" )\n",
|
||||
" )\n",
|
||||
" (norm): Qwen3RMSNorm((2560,), eps=1e-06)\n",
|
||||
" (rotary_emb): Qwen3RotaryEmbedding()\n",
|
||||
" )\n",
|
||||
" (lm_head): Linear(in_features=2560, out_features=151936, bias=False)\n",
|
||||
" )\n",
|
||||
" (mlp1): Sequential(\n",
|
||||
" (0): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
|
||||
" (1): Linear(in_features=4096, out_features=2560, bias=True)\n",
|
||||
" (2): GELU(approximate='none')\n",
|
||||
" (3): Linear(in_features=2560, out_features=2560, bias=True)\n",
|
||||
" )\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"model"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "7bbfcf47",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"InternVisionModel(\n",
|
||||
" (embeddings): InternVisionEmbeddings(\n",
|
||||
" (patch_embedding): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))\n",
|
||||
" )\n",
|
||||
" (encoder): InternVisionEncoder(\n",
|
||||
" (layers): ModuleList(\n",
|
||||
" (0-23): 24 x InternVisionEncoderLayer(\n",
|
||||
" (attn): InternAttention(\n",
|
||||
" (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
|
||||
" (attn_drop): Dropout(p=0.0, inplace=False)\n",
|
||||
" (proj_drop): Dropout(p=0.0, inplace=False)\n",
|
||||
" (inner_attn): FlashAttention()\n",
|
||||
" (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
|
||||
" )\n",
|
||||
" (mlp): InternMLP(\n",
|
||||
" (act): GELUActivation()\n",
|
||||
" (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
|
||||
" (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
|
||||
" )\n",
|
||||
" (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
|
||||
" (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
|
||||
" (drop_path1): Identity()\n",
|
||||
" (drop_path2): Identity()\n",
|
||||
" )\n",
|
||||
" )\n",
|
||||
" )\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"model.vision_model"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ae26d6cf",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# demo ?"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "817d3ccb",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "d41f94bd",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# import math\n",
|
||||
"import numpy as np\n",
|
||||
"import torch\n",
|
||||
"import torchvision.transforms as T\n",
|
||||
"# from decord import VideoReader, cpu\n",
|
||||
"from PIL import Image\n",
|
||||
"from torchvision.transforms.functional import InterpolationMode\n",
|
||||
"# from modelscope import AutoModel, AutoTokenizer\n",
|
||||
"\n",
|
||||
"IMAGENET_MEAN = (0.485, 0.456, 0.406)\n",
|
||||
"IMAGENET_STD = (0.229, 0.224, 0.225)\n",
|
||||
"\n",
|
||||
"def build_transform(input_size):\n",
|
||||
" MEAN, STD = IMAGENET_MEAN, IMAGENET_STD\n",
|
||||
" transform = T.Compose([\n",
|
||||
" T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),\n",
|
||||
" T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),\n",
|
||||
" T.ToTensor(),\n",
|
||||
" T.Normalize(mean=MEAN, std=STD)\n",
|
||||
" ])\n",
|
||||
" return transform\n",
|
||||
"\n",
|
||||
"def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):\n",
|
||||
" best_ratio_diff = float('inf')\n",
|
||||
" best_ratio = (1, 1)\n",
|
||||
" area = width * height\n",
|
||||
" for ratio in target_ratios:\n",
|
||||
" target_aspect_ratio = ratio[0] / ratio[1]\n",
|
||||
" ratio_diff = abs(aspect_ratio - target_aspect_ratio)\n",
|
||||
" if ratio_diff < best_ratio_diff:\n",
|
||||
" best_ratio_diff = ratio_diff\n",
|
||||
" best_ratio = ratio\n",
|
||||
" elif ratio_diff == best_ratio_diff:\n",
|
||||
" if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:\n",
|
||||
" best_ratio = ratio\n",
|
||||
" return best_ratio\n",
|
||||
"\n",
|
||||
"def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):\n",
|
||||
" orig_width, orig_height = image.size\n",
|
||||
" aspect_ratio = orig_width / orig_height\n",
|
||||
"\n",
|
||||
" # calculate the existing image aspect ratio\n",
|
||||
" target_ratios = set(\n",
|
||||
" (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if\n",
|
||||
" i * j <= max_num and i * j >= min_num)\n",
|
||||
" target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])\n",
|
||||
"\n",
|
||||
" # find the closest aspect ratio to the target\n",
|
||||
" target_aspect_ratio = find_closest_aspect_ratio(\n",
|
||||
" aspect_ratio, target_ratios, orig_width, orig_height, image_size)\n",
|
||||
"\n",
|
||||
" # calculate the target width and height\n",
|
||||
" target_width = image_size * target_aspect_ratio[0]\n",
|
||||
" target_height = image_size * target_aspect_ratio[1]\n",
|
||||
" blocks = target_aspect_ratio[0] * target_aspect_ratio[1]\n",
|
||||
"\n",
|
||||
" # resize the image\n",
|
||||
" resized_img = image.resize((target_width, target_height))\n",
|
||||
" processed_images = []\n",
|
||||
" for i in range(blocks):\n",
|
||||
" box = (\n",
|
||||
" (i % (target_width // image_size)) * image_size,\n",
|
||||
" (i // (target_width // image_size)) * image_size,\n",
|
||||
" ((i % (target_width // image_size)) + 1) * image_size,\n",
|
||||
" ((i // (target_width // image_size)) + 1) * image_size\n",
|
||||
" )\n",
|
||||
" # split the image\n",
|
||||
" split_img = resized_img.crop(box)\n",
|
||||
" processed_images.append(split_img)\n",
|
||||
" assert len(processed_images) == blocks\n",
|
||||
" if use_thumbnail and len(processed_images) != 1:\n",
|
||||
" thumbnail_img = image.resize((image_size, image_size))\n",
|
||||
" processed_images.append(thumbnail_img)\n",
|
||||
" return processed_images\n",
|
||||
"\n",
|
||||
"def load_image(image_file, input_size=448, max_num=12):\n",
|
||||
" image = Image.open(image_file).convert('RGB')\n",
|
||||
" transform = build_transform(input_size=input_size)\n",
|
||||
" images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)\n",
|
||||
" pixel_values = [transform(image) for image in images]\n",
|
||||
" pixel_values = torch.stack(pixel_values)\n",
|
||||
" return pixel_values"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "f2ec71a4",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Attention pooling\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "a404fa19",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import torch\n",
|
||||
"import torch.nn.functional as F\n",
|
||||
"\n",
|
||||
"def gem_pool(x, p: float = 3.0, eps: float = 1e-6):\n",
|
||||
" # x: [B, N, D]\n",
|
||||
" return (x.clamp(min=eps).pow(p).mean(dim=1)).pow(1.0/p)\n",
|
||||
"\n",
|
||||
"@torch.no_grad()\n",
|
||||
"def image_embedding(pixel_values, model, use_tiling=True):\n",
|
||||
" # pixel_values: nếu dùng processor của InternVL, có thể là [T,3,H,W]; nếu bạn tự resize = [1,3,H,W]\n",
|
||||
" out = model.vision_model(pixel_values=pixel_values, output_hidden_states=True)\n",
|
||||
" tok = out.last_hidden_state # [T, N, 1024] hoặc [1, N, 1024]\n",
|
||||
" if tok.dim() == 2: # phòng trường hợp model trả [N, D]\n",
|
||||
" tok = tok.unsqueeze(0)\n",
|
||||
"\n",
|
||||
" # 1) Attention pooling theo token, trong từng tile\n",
|
||||
" w_tok = torch.softmax(tok.norm(dim=-1), dim=1).unsqueeze(-1) # [T,N,1]\n",
|
||||
" attn_tile = (tok * w_tok).sum(dim=1) # [T,1024]\n",
|
||||
"\n",
|
||||
" # 2) Các pooling khác theo token\n",
|
||||
" mean_tile = tok.mean(dim=1) # [T,1024]\n",
|
||||
" max_tile = tok.max(dim=1).values # [T,1024]\n",
|
||||
" gem_tile = gem_pool(tok, p=3.0) # [T,1024]\n",
|
||||
"\n",
|
||||
" # 3) Attention across-tiles (giữ multi-scale nhưng gọn)\n",
|
||||
" tile_scores = attn_tile.norm(dim=-1) # [T]\n",
|
||||
" w_tile = torch.softmax(tile_scores, dim=0).unsqueeze(-1) # [T,1]\n",
|
||||
"\n",
|
||||
" mean_vec = (mean_tile * w_tile).sum(dim=0)\n",
|
||||
" max_vec = (max_tile * w_tile).sum(dim=0)\n",
|
||||
" gem_vec = (gem_tile * w_tile).sum(dim=0)\n",
|
||||
" attn_vec = (attn_tile * w_tile).sum(dim=0)\n",
|
||||
"\n",
|
||||
" # 4) Hợp nhất nhiều “góc nhìn” → 1 vector giàu thông tin\n",
|
||||
" one_vec = torch.cat([mean_vec, max_vec, gem_vec, attn_vec], dim=0) # [4*1024]\n",
|
||||
" one_vec = F.normalize(one_vec, dim=-1).unsqueeze(0) # [1, 4096]\n",
|
||||
" return one_vec.half() # FP16 để tiết kiệm bộ nhớ"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ed35a4ce",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# pool"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "3edf8b67",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import torch\n",
|
||||
"import torch.nn.functional as F\n",
|
||||
"\n",
|
||||
"# --- Pooling theo token (trong 1 tile) ---\n",
|
||||
"def _pool_tokens(tokens: torch.Tensor, how: str = \"mean\") -> torch.Tensor:\n",
|
||||
" \"\"\"\n",
|
||||
" tokens: [1, N, D] hoặc [N, D]\n",
|
||||
" return: [D]\n",
|
||||
" \"\"\"\n",
|
||||
" if tokens.dim() == 3: # [1, N, D] -> [N, D]\n",
|
||||
" tokens = tokens.squeeze(0)\n",
|
||||
"\n",
|
||||
" if how == \"mean\":\n",
|
||||
" v = tokens.mean(dim=0)\n",
|
||||
" elif how == \"max\":\n",
|
||||
" v = tokens.max(dim=0).values\n",
|
||||
" elif how == \"gem\":\n",
|
||||
" p = 3.0\n",
|
||||
" v = (tokens.clamp(min=1e-6).pow(p).mean(dim=0)).pow(1.0/p)\n",
|
||||
" elif how == \"cls\":\n",
|
||||
" # chỉ dùng nếu backbone có CLS token ở vị trí đầu\n",
|
||||
" v = tokens[0]\n",
|
||||
" else:\n",
|
||||
" raise ValueError(f\"Unknown pooling: {how}\")\n",
|
||||
"\n",
|
||||
" return v\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"@torch.no_grad()\n",
|
||||
"def image_embedding_global(model, pixel_values: torch.Tensor,\n",
|
||||
" pool: str = \"mean\",\n",
|
||||
" normalize: bool = False,\n",
|
||||
" global_index: int = 0,\n",
|
||||
" use_projector: bool = False) -> torch.Tensor:\n",
|
||||
" \"\"\"\n",
|
||||
" Trả về 1 vector [1, D] mô tả toàn ảnh, chỉ dùng GLOBAL tile.\n",
|
||||
" - pixel_values: [T,3,H,W] (ví dụ T=7) hoặc [1,3,H,W]\n",
|
||||
" - global_index: thường = 0 (tile toàn ảnh nằm đầu)\n",
|
||||
" - use_projector: CHỈ bật nếu bạn chắc chắn chiều khớp với projector (mlp1)\n",
|
||||
" \"\"\"\n",
|
||||
" model.eval()\n",
|
||||
" device = next(model.parameters()).device\n",
|
||||
" x = pixel_values.to(device)\n",
|
||||
"\n",
|
||||
" out = model.vision_model(pixel_values=x) # last_hidden_state: [T, N, D] hoặc [1, N, D]\n",
|
||||
" tok = out.last_hidden_state\n",
|
||||
"\n",
|
||||
" # chọn global tile\n",
|
||||
" if tok.size(0) > 1:\n",
|
||||
" tok = tok[global_index:global_index+1] # [1, N, D]\n",
|
||||
"\n",
|
||||
" # (tuỳ chọn) projector sang không gian khác - cẩn thận mismatch chiều!\n",
|
||||
" if use_projector:\n",
|
||||
" # CHỈ nên bật khi biết chắc input dim của mlp1 khớp với tok.size(-1)\n",
|
||||
" in_feat = getattr(model.mlp1[1], \"in_features\", None)\n",
|
||||
" if in_feat is not None and tok.size(-1) == in_feat:\n",
|
||||
" tok = model.mlp1(tok) # [1, N, D’]\n",
|
||||
" else:\n",
|
||||
" raise ValueError(f\"Projector input dim mismatch: tokens={tok.size(-1)} vs mlp1.in={in_feat}\")\n",
|
||||
"\n",
|
||||
" v = _pool_tokens(tok, how=pool) # [D]\n",
|
||||
" if normalize:\n",
|
||||
" v = F.normalize(v, dim=-1)\n",
|
||||
" return v.unsqueeze(0) # [1, D]\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"@torch.no_grad()\n",
|
||||
"def image_embedding_mean(model, pixel_values: torch.Tensor,\n",
|
||||
" pool: str = \"mean\",\n",
|
||||
" normalize: bool = True,\n",
|
||||
" use_projector: bool = False) -> torch.Tensor:\n",
|
||||
" \"\"\"\n",
|
||||
" Trả về 1 vector [1, D] mô tả toàn ảnh, bằng cách:\n",
|
||||
" (1) pool theo token trong từng tile → [T, D]\n",
|
||||
" (2) lấy mean across-tiles → [D]\n",
|
||||
" \"\"\"\n",
|
||||
" model.eval()\n",
|
||||
" device = next(model.parameters()).device\n",
|
||||
" x = pixel_values.to(device)\n",
|
||||
"\n",
|
||||
" out = model.vision_model(pixel_values=x)\n",
|
||||
" tok = out.last_hidden_state # [T, N, D] hoặc [1, N, D]\n",
|
||||
"\n",
|
||||
" if use_projector:\n",
|
||||
" in_feat = getattr(model.mlp1[1], \"in_features\", None)\n",
|
||||
" if in_feat is not None and tok.size(-1) == in_feat:\n",
|
||||
" tok = model.mlp1(tok)\n",
|
||||
" else:\n",
|
||||
" raise ValueError(f\"Projector input dim mismatch: tokens={tok.size(-1)} vs mlp1.in={in_feat}\")\n",
|
||||
"\n",
|
||||
" # pool theo token trong từng tile\n",
|
||||
" T = tok.size(0)\n",
|
||||
" per_tile = [ _pool_tokens(tok[t:t+1], how=pool) for t in range(T) ] # list of [D]\n",
|
||||
" per_tile = torch.stack(per_tile, dim=0) # [T, D]\n",
|
||||
"\n",
|
||||
" # mean across-tiles\n",
|
||||
" v = per_tile.mean(dim=0) # [D]\n",
|
||||
" if normalize:\n",
|
||||
" v = F.normalize(v, dim=-1)\n",
|
||||
" return v.unsqueeze(0) # [1, D]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "613cf001",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# infer"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "cdfdab0e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def get_image_embedding(path):\n",
|
||||
" \"\"\"\n",
|
||||
" Processes a batch of images and extracts their embeddings.\n",
|
||||
" \"\"\"\n",
|
||||
" images_pil = []\n",
|
||||
" valid_paths = []\n",
|
||||
" if path.lower().endswith(('.png', '.jpg', '.jpeg')):\n",
|
||||
" try:\n",
|
||||
" # The processor expects PIL images in RGB format\n",
|
||||
" # images_pil.append(Image.open(path).convert(\"RGB\"))\n",
|
||||
" # print(path)\n",
|
||||
" valid_paths.append(path)\n",
|
||||
" except Exception as e:\n",
|
||||
" print(f\"Warning: Could not load image {path}. Skipping. Error: {e}\")\n",
|
||||
"\n",
|
||||
" if not valid_paths:\n",
|
||||
" return np.array([]), []\n",
|
||||
"\n",
|
||||
" all_pixel_values = []\n",
|
||||
" for valid_path in valid_paths:\n",
|
||||
" pixel_values = load_image(valid_path, max_num=12).to(torch.bfloat16).cuda()\n",
|
||||
" # print(pixel_values.shape)\n",
|
||||
" all_pixel_values.append(pixel_values)\n",
|
||||
" # For pure vision feature extraction, we can provide an empty text prompt.\n",
|
||||
" # The processor handles tokenizing text and preparing images.\n",
|
||||
" inputs = torch.cat(all_pixel_values, dim=0).to(device)\n",
|
||||
" \n",
|
||||
" # embeddings = image_embedding(inputs, model, use_tiling=True)\n",
|
||||
" embeddings = image_embedding_mean(model, inputs)\n",
|
||||
" \n",
|
||||
" return embeddings.to(torch.float16).cpu().numpy()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "cdaebb7b",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"100%|██████████| 2800/2800 [20:51<00:00, 2.24it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Embeddings extracted and saved.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import json\n",
|
||||
"\n",
|
||||
"# --- Process all images in the directory ---\n",
|
||||
"image_files = [os.path.join(IMAGE_DIR, f) for f in os.listdir(IMAGE_DIR) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]\n",
|
||||
"all_embeddings = []\n",
|
||||
"filepaths = []\n",
|
||||
"BATCH_SIZE = 1\n",
|
||||
"\n",
|
||||
"with open(\"embeddings_factures_osteopathie_1k_InternVL3_5-4B-Instruct.json\", \"w\") as f:\n",
|
||||
"\n",
|
||||
" f.write(\"[\\n\")\n",
|
||||
" first = True\n",
|
||||
" for i in tqdm(range(0, len(image_files), BATCH_SIZE)):\n",
|
||||
" batch_paths = image_files[i]\n",
|
||||
" batch_embeddings = get_image_embedding(batch_paths)\n",
|
||||
" embeddings_list = [emb.tolist() for emb in batch_embeddings]\n",
|
||||
" for path, emb in zip(batch_paths, embeddings_list):\n",
|
||||
" if not first:\n",
|
||||
" f.write(\",\\n\")\n",
|
||||
" json.dump({\"filepath\": path, \"embedding\": emb}, f)\n",
|
||||
" first = False\n",
|
||||
" f.write(\"\\n]\\n\")\n",
|
||||
"\n",
|
||||
"print(\"Embeddings extracted and saved.\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "f0d0bf0a",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# check"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"id": "0772fc89",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Loaded 2800 samples with embedding dimension 1024\n",
|
||||
"Applied L2 normalization to embeddings\n",
|
||||
"(2800, 1024)\n",
|
||||
"(3918600,)\n",
|
||||
"mean sim: 0.9939966 std: 0.0073577887\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from sklearn.cluster import DBSCAN, MeanShift, AffinityPropagation\n",
|
||||
"from sklearn.preprocessing import normalize\n",
|
||||
"from sklearn.metrics import silhouette_score\n",
|
||||
"from sklearn.neighbors import NearestNeighbors\n",
|
||||
"from sklearn.decomposition import PCA\n",
|
||||
"import argparse\n",
|
||||
"import warnings\n",
|
||||
"warnings.filterwarnings('ignore')\n",
|
||||
"from sklearn.metrics.pairwise import cosine_similarity\n",
|
||||
"from datetime import datetime\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"embeddings_path = \"/home/nguyendc/sonnh/embedding-clustering/extract/embeddings_factures_osteopathie_1k_InternVL3_5-4B-Instruct.json\"\n",
|
||||
"with open(embeddings_path, 'r') as f:\n",
|
||||
" data = json.load(f)\n",
|
||||
"\n",
|
||||
"file_paths = []\n",
|
||||
"embeddings_list = []\n",
|
||||
"\n",
|
||||
"for item in data:\n",
|
||||
" file_paths.append(item['filepath'])\n",
|
||||
" embeddings_list.append(item['embedding'])\n",
|
||||
"\n",
|
||||
"embeddings = np.array(embeddings_list, dtype=np.float32)\n",
|
||||
"print(f\"Loaded {len(file_paths)} samples with embedding dimension {embeddings.shape[1]}\")\n",
|
||||
"\n",
|
||||
"# Normalize embeddings using L2 normalization for cosine distance\n",
|
||||
"embeddings_normalized = normalize(embeddings, norm='l2', axis=1)\n",
|
||||
"print(\"Applied L2 normalization to embeddings\")\n",
|
||||
"\n",
|
||||
"sims = cosine_similarity(embeddings)\n",
|
||||
"print(embeddings.shape)\n",
|
||||
"# lấy upper triangle exclude diagonal để inspect\n",
|
||||
"triu_idxs = np.triu_indices_from(sims, k=1)\n",
|
||||
"dist_vals = sims[triu_idxs]\n",
|
||||
"print(dist_vals.shape)\n",
|
||||
"print(\"mean sim:\", dist_vals.mean(), \"std:\", dist_vals.std())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "cb4ea42b",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# temp"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "2c3e6dd0",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"image_path = \"/home/nguyendc/model-factory/Finetuning-Automation/etc/data/media/docai_mgp_facture_v2_0/c363e486-5d45-425e-aef9-4791cad120f7_20250213_120759_1_scale_1.0.jpg\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"id": "29620d93",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.\n",
|
||||
"Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"User: <image>\n",
|
||||
"Please describe the image shortly.\n",
|
||||
"Assistant: The image shows a receipt for a consultation with Noël Breignaud, an osteopath. It includes his contact information, with the address \"104, cours des fossés, 33210 Langon\" and his phone number. The receipt details a payment of 55€ for a consultation dated 15/06/2020. The receipt number is 1750401922774. There are handwritten details and signatures, with the amount and date written in ink. Noël Breignaud's signature and a circular stamp are also present.\n",
|
||||
"User: <image>\n",
|
||||
"Please describe the image in detail.\n",
|
||||
"Assistant: The image is a handwritten receipt or invoice from a practitioner named Noël Breign aud, who is an osteopath. The text on the left side of the document includes the following details:\n",
|
||||
"\n",
|
||||
"- **Name:** Noël Breignaud\n",
|
||||
"- **Profession:** Ouestopathe (Osteopath)\n",
|
||||
"- **Address:** 104, cours des fossés, 33210 Lagnon\n",
|
||||
"- **Phone Number:** Tel. 06 88 70 66 43\n",
|
||||
"\n",
|
||||
"On the right side, there are registration and identification numbers:\n",
|
||||
"\n",
|
||||
"- **Nº SIRET:** 510 123 631 00010\n",
|
||||
"- **Nº ADELI:** 330001108\n",
|
||||
"- **Code APE:** 8690E\n",
|
||||
"\n",
|
||||
"The handwritten section of the document is in French and reads:\n",
|
||||
"\n",
|
||||
"- \"Déclaire avoir reçu de M. M. (fils) G[obon], Acquitté la somme de 55 €\n",
|
||||
"Pour 1 consultation en date du 05/04/2024\n",
|
||||
"N°: 1750460-19212774\"\n",
|
||||
"\n",
|
||||
"At the bottom right, there is a signature that appears to be of Noël Breignaud, with a red stamp partially visible, which seems to contain the text \"Noël BREIGNAUD\" and other markings.\n",
|
||||
"\n",
|
||||
"The date in the handwritten section is \"05/04/2024,\" indicating the receipt or service provided on that date. The amount mentioned is 55 euros for one consultation.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"pixel_values = load_image(image_path, max_num=12).to(torch.bfloat16).cuda()\n",
|
||||
"generation_config = dict(max_new_tokens=1024, do_sample=True)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"question = '<image>\\nPlease describe the image shortly.'\n",
|
||||
"response = model.chat(tokenizer, pixel_values, question, generation_config)\n",
|
||||
"print(f'User: {question}\\nAssistant: {response}')\n",
|
||||
"\n",
|
||||
"# single-image multi-round conversation (单图多轮对话)\n",
|
||||
"question = '<image>\\nPlease describe the image in detail.'\n",
|
||||
"response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)\n",
|
||||
"print(f'User: {question}\\nAssistant: {response}')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 23,
|
||||
"id": "35dc90e0",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"vout = model.vision_model(pixel_values=pixel_values, output_hidden_states=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "77f3720a",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"torch.Size([7, 1025, 1024])\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"patch_feats = vout.last_hidden_state # [B, N_patches, Dv], Dv ~ 1024 theo kiến trúc của bạn\n",
|
||||
"print(patch_feats.shape)\n",
|
||||
"# Nếu backbone có CLS token, bạn có thể dùng patch_feats[:,0]\n",
|
||||
"# Cách an toàn chung: mean-pool\n",
|
||||
"# img_vec = patch_feats.mean(dim=1) # [B, Dv]\n",
|
||||
"# img_vec = torch.nn.functional.normalize(img_vec, dim=-1) # L2 normalize cho retrieval"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "0043634c",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"torch.Size([7, 1024])"
|
||||
]
|
||||
},
|
||||
"execution_count": 21,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# img_vec.shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "92032162",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "venv",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.12"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
432
extract/clustering_example_qwen.ipynb
Normal file
@@ -0,0 +1,432 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "59f8a415",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"2025-09-02 15:00:12.976185: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
|
||||
"WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n",
|
||||
"E0000 00:00:1756825212.987686 3903757 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
|
||||
"E0000 00:00:1756825212.991038 3903757 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
|
||||
"W0000 00:00:1756825213.000855 3903757 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
|
||||
"W0000 00:00:1756825213.000880 3903757 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
|
||||
"W0000 00:00:1756825213.000882 3903757 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
|
||||
"W0000 00:00:1756825213.000884 3903757 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
|
||||
"2025-09-02 15:00:13.005218: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
|
||||
"To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[2025-09-02 15:00:17,970] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/usr/bin/ld: cannot find -laio: No such file or directory\n",
|
||||
"collect2: error: ld returned 1 exit status\n",
|
||||
"/usr/bin/ld: cannot find -laio: No such file or directory\n",
|
||||
"collect2: error: ld returned 1 exit status\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Using device: cuda\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00, 1.09it/s]\n",
|
||||
"Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import torch\n",
|
||||
"from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor\n",
|
||||
"# from qwen_vl_utils import process_vision_info\n",
|
||||
"from PIL import Image\n",
|
||||
"import os\n",
|
||||
"import numpy as np\n",
|
||||
"from tqdm import tqdm\n",
|
||||
"\n",
|
||||
"# --- Configuration ---\n",
|
||||
"MODEL_NAME = \"Qwen/Qwen2.5-VL-3B-Instruct\" # You can choose other model sizes\n",
|
||||
"\n",
|
||||
"IMAGE_DIR = \"/home/nguyendc/model-factory/Finetuning-Automation/etc/data/media/docai_mgp_facture_v2_0/\"\n",
|
||||
"BATCH_SIZE = 4\n",
|
||||
"# --- End Configuration ---\n",
|
||||
"\n",
|
||||
"# Check for GPU availability\n",
|
||||
"device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
|
||||
"print(f\"Using device: {device}\")\n",
|
||||
"\n",
|
||||
"# Load the model and processor\n",
|
||||
"model = Qwen2_5_VLForConditionalGeneration.from_pretrained(\n",
|
||||
" MODEL_NAME, torch_dtype=\"bfloat16\", device_map=\"cuda\", attn_implementation=\"flash_attention_2\",\n",
|
||||
")\n",
|
||||
"processor = AutoProcessor.from_pretrained(MODEL_NAME, trust_remote_code=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "13479e1a",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Qwen2_5_VLProcessor:\n",
|
||||
"- image_processor: Qwen2VLImageProcessor {\n",
|
||||
" \"do_convert_rgb\": true,\n",
|
||||
" \"do_normalize\": true,\n",
|
||||
" \"do_rescale\": true,\n",
|
||||
" \"do_resize\": true,\n",
|
||||
" \"image_mean\": [\n",
|
||||
" 0.48145466,\n",
|
||||
" 0.4578275,\n",
|
||||
" 0.40821073\n",
|
||||
" ],\n",
|
||||
" \"image_processor_type\": \"Qwen2VLImageProcessor\",\n",
|
||||
" \"image_std\": [\n",
|
||||
" 0.26862954,\n",
|
||||
" 0.26130258,\n",
|
||||
" 0.27577711\n",
|
||||
" ],\n",
|
||||
" \"max_pixels\": 12845056,\n",
|
||||
" \"merge_size\": 2,\n",
|
||||
" \"min_pixels\": 3136,\n",
|
||||
" \"patch_size\": 14,\n",
|
||||
" \"processor_class\": \"Qwen2_5_VLProcessor\",\n",
|
||||
" \"resample\": 3,\n",
|
||||
" \"rescale_factor\": 0.00392156862745098,\n",
|
||||
" \"size\": {\n",
|
||||
" \"longest_edge\": 12845056,\n",
|
||||
" \"shortest_edge\": 3136\n",
|
||||
" },\n",
|
||||
" \"temporal_patch_size\": 2\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"- tokenizer: Qwen2TokenizerFast(name_or_path='Qwen/Qwen2.5-VL-3B-Instruct', vocab_size=151643, model_max_length=131072, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={\n",
|
||||
"\t151643: AddedToken(\"<|endoftext|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
|
||||
"\t151644: AddedToken(\"<|im_start|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
|
||||
"\t151645: AddedToken(\"<|im_end|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
|
||||
"\t151646: AddedToken(\"<|object_ref_start|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
|
||||
"\t151647: AddedToken(\"<|object_ref_end|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
|
||||
"\t151648: AddedToken(\"<|box_start|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
|
||||
"\t151649: AddedToken(\"<|box_end|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
|
||||
"\t151650: AddedToken(\"<|quad_start|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
|
||||
"\t151651: AddedToken(\"<|quad_end|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
|
||||
"\t151652: AddedToken(\"<|vision_start|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
|
||||
"\t151653: AddedToken(\"<|vision_end|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
|
||||
"\t151654: AddedToken(\"<|vision_pad|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
|
||||
"\t151655: AddedToken(\"<|image_pad|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
|
||||
"\t151656: AddedToken(\"<|video_pad|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
|
||||
"\t151657: AddedToken(\"<tool_call>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n",
|
||||
"\t151658: AddedToken(\"</tool_call>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n",
|
||||
"\t151659: AddedToken(\"<|fim_prefix|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n",
|
||||
"\t151660: AddedToken(\"<|fim_middle|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n",
|
||||
"\t151661: AddedToken(\"<|fim_suffix|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n",
|
||||
"\t151662: AddedToken(\"<|fim_pad|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n",
|
||||
"\t151663: AddedToken(\"<|repo_name|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n",
|
||||
"\t151664: AddedToken(\"<|file_sep|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n",
|
||||
"}\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"{\n",
|
||||
" \"processor_class\": \"Qwen2_5_VLProcessor\"\n",
|
||||
"}"
|
||||
]
|
||||
},
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"processor"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "cdfdab0e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def get_image_embeddings(image_paths):\n",
|
||||
" \"\"\"\n",
|
||||
" Processes a batch of images and extracts their embeddings.\n",
|
||||
" \"\"\"\n",
|
||||
" images_pil = []\n",
|
||||
" valid_paths = []\n",
|
||||
" for path in image_paths:\n",
|
||||
" if path.lower().endswith(('.png', '.jpg', '.jpeg')):\n",
|
||||
" try:\n",
|
||||
" # The processor expects PIL images in RGB format\n",
|
||||
" images_pil.append(Image.open(path).convert(\"RGB\"))\n",
|
||||
" valid_paths.append(path)\n",
|
||||
" except Exception as e:\n",
|
||||
" print(f\"Warning: Could not load image {path}. Skipping. Error: {e}\")\n",
|
||||
"\n",
|
||||
" if not images_pil:\n",
|
||||
" return np.array([]), []\n",
|
||||
"\n",
|
||||
" # For pure vision feature extraction, we can provide an empty text prompt.\n",
|
||||
" # The processor handles tokenizing text and preparing images.\n",
|
||||
" inputs = processor(\n",
|
||||
" text=[\"\"] * len(images_pil),\n",
|
||||
" images=images_pil,\n",
|
||||
" padding=True,\n",
|
||||
" return_tensors=\"pt\"\n",
|
||||
" ).to(device)\n",
|
||||
"\n",
|
||||
" with torch.no_grad():\n",
|
||||
" # Get the vision embeddings from the model's vision tower\n",
|
||||
" vision_outputs = model.visual(inputs['pixel_values'].to(dtype=model.dtype), grid_thw=inputs['image_grid_thw'])\n",
|
||||
" # We'll use the pooled output as the embedding\n",
|
||||
" embeddings = vision_outputs\n",
|
||||
"\n",
|
||||
" return embeddings.to(torch.float16).cpu().numpy()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "cdaebb7b",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"100%|██████████| 700/700 [22:12<00:00, 1.90s/it]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Embeddings extracted and saved.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import json\n",
|
||||
"\n",
|
||||
"# --- Process all images in the directory ---\n",
|
||||
"image_files = [os.path.join(IMAGE_DIR, f) for f in os.listdir(IMAGE_DIR) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]\n",
|
||||
"all_embeddings = []\n",
|
||||
"filepaths = []\n",
|
||||
"\n",
|
||||
"with open(\"embeddings_factures_osteopathie_1k_qwen.json\", \"w\") as f:\n",
|
||||
"\n",
|
||||
" f.write(\"[\\n\")\n",
|
||||
" first = True\n",
|
||||
" for i in tqdm(range(0, len(image_files), BATCH_SIZE)):\n",
|
||||
" batch_paths = image_files[i:i+BATCH_SIZE]\n",
|
||||
" batch_embeddings = get_image_embeddings(batch_paths)\n",
|
||||
" embeddings_list = [emb.tolist() for emb in batch_embeddings]\n",
|
||||
" for path, emb in zip(batch_paths, embeddings_list):\n",
|
||||
" if not first:\n",
|
||||
" f.write(\",\\n\")\n",
|
||||
" json.dump({\"filepath\": path, \"embedding\": emb}, f)\n",
|
||||
" first = False\n",
|
||||
" f.write(\"\\n]\\n\")\n",
|
||||
"\n",
|
||||
"print(\"Embeddings extracted and saved.\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "2c3e6dd0",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Loaded 2800 samples with embedding dimension 2048\n",
|
||||
"Applied L2 normalization to embeddings\n",
|
||||
"(2800, 2048)\n",
|
||||
"(3918600,)\n",
|
||||
"mean sim: 0.37961555 std: 0.22605234\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from sklearn.cluster import DBSCAN, MeanShift, AffinityPropagation\n",
|
||||
"from sklearn.preprocessing import normalize\n",
|
||||
"from sklearn.metrics import silhouette_score\n",
|
||||
"from sklearn.neighbors import NearestNeighbors\n",
|
||||
"from sklearn.decomposition import PCA\n",
|
||||
"import argparse\n",
|
||||
"import warnings\n",
|
||||
"warnings.filterwarnings('ignore')\n",
|
||||
"from sklearn.metrics.pairwise import cosine_similarity\n",
|
||||
"from datetime import datetime\n",
|
||||
"import json\n",
|
||||
"\n",
|
||||
"embeddings_path = \"/home/nguyendc/sonnh/embedding-clustering/extract/embeddings_factures_osteopathie_1k_qwen.json\"\n",
|
||||
"with open(embeddings_path, 'r') as f:\n",
|
||||
" data = json.load(f)\n",
|
||||
"\n",
|
||||
"file_paths = []\n",
|
||||
"embeddings_list = []\n",
|
||||
"\n",
|
||||
"for item in data:\n",
|
||||
" file_paths.append(item['filepath'])\n",
|
||||
" embeddings_list.append(item['embedding'])\n",
|
||||
"\n",
|
||||
"embeddings = np.array(embeddings_list, dtype=np.float32)\n",
|
||||
"print(f\"Loaded {len(file_paths)} samples with embedding dimension {embeddings.shape[1]}\")\n",
|
||||
"\n",
|
||||
"# Normalize embeddings using L2 normalization for cosine distance\n",
|
||||
"embeddings_normalized = normalize(embeddings, norm='l2', axis=1)\n",
|
||||
"print(\"Applied L2 normalization to embeddings\")\n",
|
||||
"\n",
|
||||
"sims = cosine_similarity(embeddings)\n",
|
||||
"print(embeddings.shape)\n",
|
||||
"# lấy upper triangle exclude diagonal để inspect\n",
|
||||
"triu_idxs = np.triu_indices_from(sims, k=1)\n",
|
||||
"dist_vals = sims[triu_idxs]\n",
|
||||
"print(dist_vals.shape)\n",
|
||||
"print(\"mean sim:\", dist_vals.mean(), \"std:\", dist_vals.std())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "29620d93",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "27fea4f3",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" 100% |███████████████| 1091/1091 [174.1ms elapsed, 0s remaining, 6.3K samples/s] \n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import fiftyone as fo\n",
|
||||
"import fiftyone.brain as fob\n",
|
||||
"import numpy as np\n",
|
||||
"from sklearn.mixture import GaussianMixture\n",
|
||||
"import json\n",
|
||||
"\n",
|
||||
"DATASET_NAME = \"mock\"\n",
|
||||
"\n",
|
||||
"json_path = \"./embeddings_factures_osteopathie_1k_qwen.json\"\n",
|
||||
"\n",
|
||||
"with open(json_path, \"r\") as file:\n",
|
||||
" embedding_data = json.load(file)\n",
|
||||
"\n",
|
||||
"file_paths = []\n",
|
||||
"embeddings = []\n",
|
||||
"for i, record in enumerate(embedding_data):\n",
|
||||
" file_paths.append(record.get(\"filepath\"))\n",
|
||||
" embeddings.append(record.get(\"embedding\"))\n",
|
||||
"\n",
|
||||
"if DATASET_NAME in fo.list_datasets():\n",
|
||||
" dataset = fo.load_dataset(DATASET_NAME)\n",
|
||||
" dataset.delete()\n",
|
||||
"dataset = fo.Dataset(DATASET_NAME)\n",
|
||||
"\n",
|
||||
"# Add samples to the dataset\n",
|
||||
"samples = [fo.Sample(filepath=p) for p in file_paths]\n",
|
||||
"dataset.add_samples(samples)\n",
|
||||
"\n",
|
||||
"# Building Gaussian mixture model (GMM)\n",
|
||||
"n_gaussians = 50\n",
|
||||
"gmm = GaussianMixture(n_components=n_gaussians, random_state=42)\n",
|
||||
"gmm.fit(embeddings)\n",
|
||||
"cluster_labels = gmm.predict(embeddings)\n",
|
||||
"\n",
|
||||
"# Adding labeled embeddings to visulization\n",
|
||||
"dataset.add_sample_field(\"gmm_cluster\", fo.IntField)\n",
|
||||
"for sample, label in zip(dataset, cluster_labels):\n",
|
||||
" sample[\"gmm_cluster_50_gaussians\"] = int(label)\n",
|
||||
" sample.save()\n",
|
||||
"\n",
|
||||
"n_gaussians = 200\n",
|
||||
"gmm = GaussianMixture(n_components=n_gaussians, random_state=42)\n",
|
||||
"gmm.fit(embeddings)\n",
|
||||
"cluster_labels = gmm.predict(embeddings)\n",
|
||||
"\n",
|
||||
"# Adding labeled embeddings to visulization\n",
|
||||
"dataset.add_sample_field(\"gmm_cluster\", fo.IntField)\n",
|
||||
"for sample, label in zip(dataset, cluster_labels):\n",
|
||||
" sample[\"gmm_cluster_200_gaussians\"] = int(label)\n",
|
||||
" sample.save()\n",
|
||||
"\n",
|
||||
"# --- Visualize the Embeddings with UMAP ---\n",
|
||||
"# This will compute a 2D representation of your embeddings\n",
|
||||
"# for visualization.\n",
|
||||
"res = fob.compute_visualization(\n",
|
||||
" dataset,\n",
|
||||
" embeddings=embeddings,\n",
|
||||
" brain_key=\"qwen_vision_viz\",\n",
|
||||
" method=\"tsne\",\n",
|
||||
" verbose=True\n",
|
||||
")\n",
|
||||
"dataset.set_values(\"qwen_umap\", res.current_points)\n",
|
||||
"\n",
|
||||
"print(\"UMAP visualization computed. Launch the app to see the plot.\")\n",
|
||||
"session = fo.launch_app(dataset)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "venv",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.12"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
1301
extract/clustering_example_qwen_Debug.ipynb
Normal file
File diff suppressed because it is too large
90
extract/extract.py
Normal file
@@ -0,0 +1,90 @@
|
||||
import torch
|
||||
from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor, AutoModel
|
||||
# from qwen_vl_utils import process_vision_info
|
||||
from PIL import Image
|
||||
import os
|
||||
import numpy as np
|
||||
import json
|
||||
from tqdm import tqdm
|
||||
|
||||
from transformers import LayoutLMv3ImageProcessor, LayoutLMv3Model
|
||||
|
||||
|
||||
# --- Configuration ---
|
||||
MODEL_NAME = "microsoft/layoutlmv3-base" # You can choose other model sizes
|
||||
IMAGE_DIR = "/home/nguyendc/model-factory/Finetuning-Automation/etc/data/media/docai_mgp_facture_v2_0/"
|
||||
BATCH_SIZE = 8
|
||||
# --- End Configuration ---
|
||||
|
||||
# Check for GPU availability
|
||||
device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
print(f"Using device: {device}")
|
||||
|
||||
# Load the model and processor
|
||||
# model = AutoModel.from_pretrained(
|
||||
# MODEL_NAME, torch_dtype="bfloat16", device_map="cuda" # , attn_implementation="flash_attention_2",
|
||||
# )
|
||||
|
||||
|
||||
model = LayoutLMv3Model.from_pretrained(MODEL_NAME, device_map="cuda")
|
||||
processor = LayoutLMv3ImageProcessor.from_pretrained(MODEL_NAME, trust_remote_code=True)
|
||||
|
||||
|
||||
def get_image_embeddings(image_paths):
|
||||
"""
|
||||
Processes a batch of images and extracts their embeddings.
|
||||
"""
|
||||
images_pil = []
|
||||
valid_paths = []
|
||||
for path in image_paths:
|
||||
if path.lower().endswith(('.png', '.jpg', '.jpeg')):
|
||||
try:
|
||||
# The processor expects PIL images in RGB format
|
||||
images_pil.append(Image.open(path).convert("RGB"))
|
||||
valid_paths.append(path)
|
||||
except Exception as e:
|
||||
print(f"Warning: Could not load image {path}. Skipping. Error: {e}")
|
||||
|
||||
if not images_pil:
|
||||
return np.array([]), []
|
||||
|
||||
# For pure vision feature extraction, we can provide an empty text prompt.
|
||||
# The processor handles tokenizing text and preparing images.
|
||||
# LayoutLMv3 expects 224x224 images by default
|
||||
inputs = processor(
|
||||
# text=[""] * len(images_pil),
|
||||
images=images_pil,
|
||||
# padding=True,
|
||||
size = {"height" : 224, "width": 224},
|
||||
return_tensors="pt"
|
||||
).to(device)
|
||||
|
||||
with torch.no_grad():
|
||||
# Get the vision embeddings from the model's vision tower
|
||||
vision_outputs = model.forward(pixel_values=inputs['pixel_values'].to(dtype=model.dtype)) # , grid_thw=inputs['image_grid_thw'])
|
||||
        # Use the first token of the last hidden state as the image embedding
|
||||
embeddings = vision_outputs[0][:,0,:]
|
||||
|
||||
return embeddings.to(torch.float16).cpu().numpy()
|
||||
|
||||
|
||||
# --- Process all images in the directory ---
|
||||
image_files = [os.path.join(IMAGE_DIR, f) for f in os.listdir(IMAGE_DIR) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
|
||||
all_embeddings = []
|
||||
filepaths = []
|
||||
|
||||
with open("embeddings_factures_ostepoathie_1k.json", "w") as f:
|
||||
f.write("[\n")
|
||||
first = True
|
||||
for i in tqdm(range(0, len(image_files), BATCH_SIZE)):
|
||||
batch_paths = image_files[i:i+BATCH_SIZE]
|
||||
batch_embeddings = get_image_embeddings(batch_paths)
|
||||
embeddings_list = [emb.tolist() for emb in batch_embeddings]
|
||||
for path, emb in zip(batch_paths, embeddings_list):
|
||||
if not first:
|
||||
f.write(",\n")
|
||||
json.dump({"filepath": path, "embedding": emb}, f)
|
||||
first = False
|
||||
f.write("\n]\n")
|
||||
|
||||
print("Embeddings extracted and saved.")
|
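For reference, below is a minimal sketch of how the embeddings JSON written by extract.py could be consumed downstream for clustering. It assumes scikit-learn is installed; the file name and field names match the script above, while the cluster count k=5 is an arbitrary illustrative choice, not something fixed by this commit.

import json
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity

# Load the embeddings written by extract.py (hypothetical downstream step)
with open("embeddings_factures_ostepoathie_1k.json") as f:
    records = json.load(f)

paths = [r["filepath"] for r in records]
X = np.array([r["embedding"] for r in records], dtype=np.float32)

# Pairwise cosine similarities between document images
sims = cosine_similarity(X)
upper = sims[np.triu_indices_from(sims, k=1)]
print("mean pairwise similarity:", upper.mean(), "std:", upper.std())

# Simple K-Means grouping; k=5 is an arbitrary example value
kmeans = KMeans(n_clusters=5, random_state=0, n_init=10)
labels = kmeans.fit_predict(X)
for path, label in zip(paths[:5], labels[:5]):
    print(label, path)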
201
extract/extract_donut.py
Normal file
@@ -0,0 +1,201 @@
import torch
from transformers import DonutProcessor, VisionEncoderDecoderModel
from PIL import Image
import os
import numpy as np
import json
from tqdm import tqdm


# --- Configuration ---
MODEL_NAME = "naver-clova-ix/donut-base-finetuned-docvqa"  # Donut model for document VQA
IMAGE_DIR = "/home/nguyendc/model-factory/Finetuning-Automation/etc/data/media/docai_mgp_facture_v2_0/"
BATCH_SIZE = 4  # Smaller batch size for Donut as it's memory intensive
# --- End Configuration ---

# Check for GPU availability
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load the Donut model and processor
print("Loading Donut model and processor...")
processor = DonutProcessor.from_pretrained(MODEL_NAME)
model = VisionEncoderDecoderModel.from_pretrained(MODEL_NAME)
model.to(device)
model.eval()

# Set model to half precision for efficiency if using GPU
if device == "cuda":
    model = model.half()


def get_document_embeddings(image_paths):
    """
    Processes a batch of document images and extracts their embeddings using Donut.
    Uses the encoder part of the VisionEncoderDecoder model to get visual representations.
    """
    images_pil = []
    valid_paths = []

    for path in image_paths:
        if path.lower().endswith(('.png', '.jpg', '.jpeg')):
            try:
                # Load and convert image to RGB
                image = Image.open(path).convert("RGB")
                images_pil.append(image)
                valid_paths.append(path)
            except Exception as e:
                print(f"Warning: Could not load image {path}. Skipping. Error: {e}")

    if not images_pil:
        return np.array([]), []

    embeddings_list = []

    # Process images one by one to avoid memory issues
    for image in images_pil:
        try:
            # Preprocess the image
            pixel_values = processor(image, return_tensors="pt").pixel_values
            pixel_values = pixel_values.to(device)

            if device == "cuda":
                pixel_values = pixel_values.half()

            with torch.no_grad():
                # Get encoder outputs (visual features)
                encoder_outputs = model.encoder(pixel_values=pixel_values)

                # Use the last hidden state and apply global average pooling
                # to get a fixed-size representation
                last_hidden_state = encoder_outputs.last_hidden_state  # [batch_size, seq_len, hidden_size]

                # Global average pooling across the sequence dimension
                embedding = torch.mean(last_hidden_state, dim=1)  # [batch_size, hidden_size]

            embeddings_list.append(embedding.squeeze().cpu().float().numpy())

        except Exception as e:
            print(f"Warning: Could not process image. Error: {e}")
            # Add zero embedding for failed images to maintain consistency
            embeddings_list.append(np.zeros(model.config.encoder.hidden_size))

    return np.array(embeddings_list), valid_paths


def extract_document_info(image_path, question="What information is in this document?"):
    """
    Extract specific information from a document using Donut's text generation capability.
    This function demonstrates how to use Donut for document understanding tasks.
    """
    try:
        image = Image.open(image_path).convert("RGB")

        # Prepare the task prompt for document VQA
        task_prompt = f"<s_docvqa><s_question>{question}</s_question><s_answer>"

        # Process the image and prompt
        inputs = processor(image, task_prompt, return_tensors="pt")
        inputs = {k: v.to(device) for k, v in inputs.items()}

        if device == "cuda":
            inputs["pixel_values"] = inputs["pixel_values"].half()

        with torch.no_grad():
            # Generate answer
            generated_ids = model.generate(
                **inputs,
                max_length=512,
                early_stopping=True,
                pad_token_id=processor.tokenizer.pad_token_id,
                eos_token_id=processor.tokenizer.eos_token_id,
                use_cache=True,
                num_beams=1,
                bad_words_ids=[[processor.tokenizer.unk_token_id]],
                return_dict_in_generate=True,
            )

        # Decode the generated answer
        decoded_text = processor.batch_decode(generated_ids.sequences)[0]
        # Extract the answer part
        answer = decoded_text.split("<s_answer>")[-1].replace("</s_answer>", "").strip()

        return answer

    except Exception as e:
        print(f"Error extracting info from {image_path}: {e}")
        return ""


# --- Process all images in the directory ---
print("Scanning for image files...")
image_files = [os.path.join(IMAGE_DIR, f) for f in os.listdir(IMAGE_DIR)
               if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
print(f"Found {len(image_files)} image files")

all_embeddings = []
filepaths = []

# Extract embeddings and save to JSON
print("Extracting embeddings using Donut...")
with open("embeddings_factures_donut.json", "w") as f:
    f.write("[\n")
    first = True

    for i in tqdm(range(0, len(image_files), BATCH_SIZE), desc="Processing batches"):
        batch_paths = image_files[i:i+BATCH_SIZE]
        batch_embeddings, valid_paths = get_document_embeddings(batch_paths)

        if len(batch_embeddings) > 0:
            embeddings_list = [emb.tolist() for emb in batch_embeddings]

            for path, emb in zip(valid_paths, embeddings_list):
                if not first:
                    f.write(",\n")

                entry = {
                    "filepath": path,
                    "embedding": emb,
                    "model": "donut-base-finetuned-docvqa",
                    "embedding_size": len(emb)
                }

                json.dump(entry, f)
                first = False

    f.write("\n]\n")

print("Embeddings extracted and saved to 'embeddings_factures_donut.json'")

# Optional: Extract some sample document information
print("\nExtracting sample document information...")
sample_images = image_files[:3]  # Process first 3 images as samples

sample_info = []
for img_path in sample_images:
    print(f"Processing: {os.path.basename(img_path)}")

    # Extract different types of information
    questions = [
        "What is the total amount?",
        "What is the invoice number?",
        "What is the date?",
        "Who is the vendor?",
        "What are the main items?"
    ]

    info = {"filepath": img_path, "extracted_info": {}}

    for question in questions:
        answer = extract_document_info(img_path, question)
        info["extracted_info"][question] = answer
        print(f"  {question}: {answer}")

    sample_info.append(info)

# Save sample extraction results
with open("donut_sample_extractions.json", "w") as f:
    json.dump(sample_info, f, indent=2, ensure_ascii=False)

print("Sample document information extracted and saved to 'donut_sample_extractions.json'")
print("Processing completed!")
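As a usage note, extract_document_info can also be called on its own for a one-off query against a single invoice. The sketch below assumes the model and processor defined above are already loaded; the sample path is a placeholder, not a file from this repository.

# Hypothetical one-off query, reusing extract_document_info() from extract_donut.py
sample_path = "/path/to/some_invoice.jpg"  # placeholder path
for q in ["What is the total amount?", "What is the invoice number?"]:
    print(q, "->", extract_document_info(sample_path, q))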
139
extract/test.ipynb
Normal file
@@ -0,0 +1,139 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 10,
"id": "a314a8ac",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[0. , 0. ],\n",
" [0.26726124, 0.56694671]])"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.metrics.pairwise import cosine_similarity\n",
"import numpy as np\n",
"X = [[0, 0, 0], [1, 2, 3]]\n",
"Y = [[1, 0, 0], [1, 1, 0]]\n",
"cosine_similarity(X, Y)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "4b560c4f",
"metadata": {},
"outputs": [],
"source": [
"sims = cosine_similarity(X)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d8d5d17a",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 34,
"id": "a1098a5a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(3, 3)\n",
"(array([0, 0, 1]), array([1, 2, 2]))\n",
"(3,)\n",
"mean sim: -0.3333333333333334 std: 0.47140452079103173\n"
]
}
],
"source": [
"# X = np.array([\n",
"# [0, 0, 0], \n",
"# [-1, 100, -1000],\n",
"# [-1, -2, -4]\n",
"# ]\n",
"# )\n",
"\n",
"X = np.array([\n",
" [0, 0, 0], \n",
" [1,1,1],\n",
" [-1, -1, -1]\n",
" ]\n",
" )\n",
"print(X.shape)\n",
"sims = cosine_similarity(X)\n",
"\n",
"triu_idxs = np.triu_indices_from(sims, k=1)\n",
"print(triu_idxs)\n",
"dist_vals = sims[triu_idxs]\n",
"print(dist_vals.shape)\n",
"print(\"mean sim:\", dist_vals.mean(), \"std:\", dist_vals.std())"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "2dacad18",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([0.])"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dist_vals"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "76d25e07",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}