
{
"cells": [
{
"cell_type": "code",
"execution_count": 10,
"id": "59f8a415",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Using device: cuda\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00, 1.26it/s]\n"
]
}
],
"source": [
"import torch\n",
"from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor, AutoModel\n",
"# from qwen_vl_utils import process_vision_info\n",
"from PIL import Image\n",
"import os\n",
"import numpy as np\n",
"from tqdm import tqdm\n",
"import math\n",
"import torch\n",
"from transformers import AutoTokenizer, AutoModel\n",
"import timm\n",
"\n",
"# --- Configuration ---\n",
"MODEL_NAME = \"Qwen/Qwen2.5-VL-3B-Instruct\" # You can choose other model sizes\n",
"IMAGE_DIR = \"/home/nguyendc/model-factory/Finetuning-Automation/etc/data/media/docai_mgp_facture_v2_0/\"\n",
"BATCH_SIZE = 4\n",
"# --- End Configuration ---\n",
"\n",
"# Check for GPU availability\n",
"device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
"print(f\"Using device: {device}\")\n",
"\n",
"# Load the model and processor\n",
"\n",
"model = Qwen2_5_VLForConditionalGeneration.from_pretrained(\n",
" MODEL_NAME,\n",
" torch_dtype=torch.bfloat16,\n",
" # use_flash_attn=True,\n",
" attn_implementation=\"flash_attention_2\",\n",
" trust_remote_code=True,\n",
" device_map=\"cuda\").eval()\n",
"\n",
"processor = AutoProcessor.from_pretrained(\n",
" MODEL_NAME, \n",
" trust_remote_code=True\n",
" )\n",
"tokenizer = AutoTokenizer.from_pretrained(\n",
" MODEL_NAME, \n",
" trust_remote_code=True, \n",
" use_fast=False\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "69e7b24e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Qwen2_5_VLForConditionalGeneration(\n",
" (visual): Qwen2_5_VisionTransformerPretrainedModel(\n",
" (patch_embed): Qwen2_5_VisionPatchEmbed(\n",
" (proj): Conv3d(3, 1280, kernel_size=(2, 14, 14), stride=(2, 14, 14), bias=False)\n",
" )\n",
" (rotary_pos_emb): Qwen2_5_VisionRotaryEmbedding()\n",
" (blocks): ModuleList(\n",
" (0-31): 32 x Qwen2_5_VLVisionBlock(\n",
" (norm1): Qwen2RMSNorm((1280,), eps=1e-06)\n",
" (norm2): Qwen2RMSNorm((1280,), eps=1e-06)\n",
" (attn): Qwen2_5_VLVisionFlashAttention2(\n",
" (qkv): Linear(in_features=1280, out_features=3840, bias=True)\n",
" (proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
" )\n",
" (mlp): Qwen2_5_VLMLP(\n",
" (gate_proj): Linear(in_features=1280, out_features=3420, bias=True)\n",
" (up_proj): Linear(in_features=1280, out_features=3420, bias=True)\n",
" (down_proj): Linear(in_features=3420, out_features=1280, bias=True)\n",
" (act_fn): SiLU()\n",
" )\n",
" )\n",
" )\n",
" (merger): Qwen2_5_VLPatchMerger(\n",
" (ln_q): Qwen2RMSNorm((1280,), eps=1e-06)\n",
" (mlp): Sequential(\n",
" (0): Linear(in_features=5120, out_features=5120, bias=True)\n",
" (1): GELU(approximate='none')\n",
" (2): Linear(in_features=5120, out_features=2048, bias=True)\n",
" )\n",
" )\n",
" )\n",
" (model): Qwen2_5_VLModel(\n",
" (embed_tokens): Embedding(151936, 2048)\n",
" (layers): ModuleList(\n",
" (0-35): 36 x Qwen2_5_VLDecoderLayer(\n",
" (self_attn): Qwen2_5_VLFlashAttention2(\n",
" (q_proj): Linear(in_features=2048, out_features=2048, bias=True)\n",
" (k_proj): Linear(in_features=2048, out_features=256, bias=True)\n",
" (v_proj): Linear(in_features=2048, out_features=256, bias=True)\n",
" (o_proj): Linear(in_features=2048, out_features=2048, bias=False)\n",
" (rotary_emb): Qwen2_5_VLRotaryEmbedding()\n",
" )\n",
" (mlp): Qwen2MLP(\n",
" (gate_proj): Linear(in_features=2048, out_features=11008, bias=False)\n",
" (up_proj): Linear(in_features=2048, out_features=11008, bias=False)\n",
" (down_proj): Linear(in_features=11008, out_features=2048, bias=False)\n",
" (act_fn): SiLU()\n",
" )\n",
" (input_layernorm): Qwen2RMSNorm((2048,), eps=1e-06)\n",
" (post_attention_layernorm): Qwen2RMSNorm((2048,), eps=1e-06)\n",
" )\n",
" )\n",
" (norm): Qwen2RMSNorm((2048,), eps=1e-06)\n",
" (rotary_emb): Qwen2_5_VLRotaryEmbedding()\n",
" )\n",
" (lm_head): Linear(in_features=2048, out_features=151936, bias=False)\n",
")"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "0865539d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Qwen2_5_VLProcessor:\n",
"- image_processor: Qwen2VLImageProcessor {\n",
" \"do_convert_rgb\": true,\n",
" \"do_normalize\": true,\n",
" \"do_rescale\": true,\n",
" \"do_resize\": true,\n",
" \"image_mean\": [\n",
" 0.48145466,\n",
" 0.4578275,\n",
" 0.40821073\n",
" ],\n",
" \"image_processor_type\": \"Qwen2VLImageProcessor\",\n",
" \"image_std\": [\n",
" 0.26862954,\n",
" 0.26130258,\n",
" 0.27577711\n",
" ],\n",
" \"max_pixels\": 12845056,\n",
" \"merge_size\": 2,\n",
" \"min_pixels\": 3136,\n",
" \"patch_size\": 14,\n",
" \"processor_class\": \"Qwen2_5_VLProcessor\",\n",
" \"resample\": 3,\n",
" \"rescale_factor\": 0.00392156862745098,\n",
" \"size\": {\n",
" \"longest_edge\": 12845056,\n",
" \"shortest_edge\": 3136\n",
" },\n",
" \"temporal_patch_size\": 2\n",
"}\n",
"\n",
"- tokenizer: Qwen2TokenizerFast(name_or_path='Qwen/Qwen2.5-VL-3B-Instruct', vocab_size=151643, model_max_length=131072, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={\n",
"\t151643: AddedToken(\"<|endoftext|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t151644: AddedToken(\"<|im_start|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t151645: AddedToken(\"<|im_end|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t151646: AddedToken(\"<|object_ref_start|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t151647: AddedToken(\"<|object_ref_end|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t151648: AddedToken(\"<|box_start|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t151649: AddedToken(\"<|box_end|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t151650: AddedToken(\"<|quad_start|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t151651: AddedToken(\"<|quad_end|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t151652: AddedToken(\"<|vision_start|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t151653: AddedToken(\"<|vision_end|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t151654: AddedToken(\"<|vision_pad|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t151655: AddedToken(\"<|image_pad|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t151656: AddedToken(\"<|video_pad|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t151657: AddedToken(\"<tool_call>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n",
"\t151658: AddedToken(\"</tool_call>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n",
"\t151659: AddedToken(\"<|fim_prefix|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n",
"\t151660: AddedToken(\"<|fim_middle|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n",
"\t151661: AddedToken(\"<|fim_suffix|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n",
"\t151662: AddedToken(\"<|fim_pad|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n",
"\t151663: AddedToken(\"<|repo_name|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n",
"\t151664: AddedToken(\"<|file_sep|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n",
"}\n",
")\n",
"\n",
"{\n",
" \"processor_class\": \"Qwen2_5_VLProcessor\"\n",
"}"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"processor"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "abf32acc",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Qwen2_5_VisionTransformerPretrainedModel(\n",
" (patch_embed): Qwen2_5_VisionPatchEmbed(\n",
" (proj): Conv3d(3, 1280, kernel_size=(2, 14, 14), stride=(2, 14, 14), bias=False)\n",
" )\n",
" (rotary_pos_emb): Qwen2_5_VisionRotaryEmbedding()\n",
" (blocks): ModuleList(\n",
" (0-31): 32 x Qwen2_5_VLVisionBlock(\n",
" (norm1): Qwen2RMSNorm((1280,), eps=1e-06)\n",
" (norm2): Qwen2RMSNorm((1280,), eps=1e-06)\n",
" (attn): Qwen2_5_VLVisionFlashAttention2(\n",
" (qkv): Linear(in_features=1280, out_features=3840, bias=True)\n",
" (proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
" )\n",
" (mlp): Qwen2_5_VLMLP(\n",
" (gate_proj): Linear(in_features=1280, out_features=3420, bias=True)\n",
" (up_proj): Linear(in_features=1280, out_features=3420, bias=True)\n",
" (down_proj): Linear(in_features=3420, out_features=1280, bias=True)\n",
" (act_fn): SiLU()\n",
" )\n",
" )\n",
" )\n",
" (merger): Qwen2_5_VLPatchMerger(\n",
" (ln_q): Qwen2RMSNorm((1280,), eps=1e-06)\n",
" (mlp): Sequential(\n",
" (0): Linear(in_features=5120, out_features=5120, bias=True)\n",
" (1): GELU(approximate='none')\n",
" (2): Linear(in_features=5120, out_features=2048, bias=True)\n",
" )\n",
" )\n",
")"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model.visual"
]
},
{
"cell_type": "markdown",
"id": "8037ae45",
"metadata": {},
"source": [
"# preprocess"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d41f94bd",
"metadata": {},
"outputs": [],
"source": []
},
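{
"cell_type": "markdown",
"id": "a1b2c3d4",
"metadata": {},
"source": [
"The sketch below was added for illustration and was not part of the original run. It loads one image from `IMAGE_DIR` and prints the shapes the Qwen2.5-VL processor returns. Unlike InternVL-style tiling, this processor emits a flattened patch tensor `pixel_values` together with an `image_grid_thw` grid rather than a `[T, 3, H, W]` tile stack, which matters for the pooling code further down. Exact keys and shapes may vary with the installed transformers version."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a1b2c3d5",
"metadata": {},
"outputs": [],
"source": [
"# Illustrative sketch: inspect the processor outputs for a single image.\n",
"# Assumption: IMAGE_DIR contains at least one .png/.jpg/.jpeg file.\n",
"sample_files = [f for f in os.listdir(IMAGE_DIR) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]\n",
"img = Image.open(os.path.join(IMAGE_DIR, sample_files[0])).convert(\"RGB\")\n",
"inputs = processor(text=[\"\"], images=[img], padding=True, return_tensors=\"pt\")\n",
"for k, v in inputs.items():\n",
"    if hasattr(v, \"shape\"):\n",
"        print(k, tuple(v.shape))"
]
},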
{
"cell_type": "markdown",
"id": "f2ec71a4",
"metadata": {},
"source": [
"# Attention pooling\n"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "a404fa19",
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"import torch.nn.functional as F\n",
"\n",
"def gem_pool(x, p: float = 3.0, eps: float = 1e-6):\n",
" # x: [B, N, D]\n",
" return (x.clamp(min=eps).pow(p).mean(dim=1)).pow(1.0/p)\n",
"\n",
"@torch.no_grad()\n",
"def image_embedding(pixel_values, model, use_tiling=True):\n",
" # pixel_values: nếu dùng processor của InternVL, có thể là [T,3,H,W]; nếu bạn tự resize = [1,3,H,W]\n",
" out = model.vision_model(pixel_values=pixel_values, output_hidden_states=True)\n",
" tok = out.last_hidden_state # [T, N, 1024] hoặc [1, N, 1024]\n",
" if tok.dim() == 2: # phòng trường hợp model trả [N, D]\n",
" tok = tok.unsqueeze(0)\n",
"\n",
" # 1) Attention pooling theo token, trong từng tile\n",
" w_tok = torch.softmax(tok.norm(dim=-1), dim=1).unsqueeze(-1) # [T,N,1]\n",
" attn_tile = (tok * w_tok).sum(dim=1) # [T,1024]\n",
"\n",
" # 2) Các pooling khác theo token\n",
" mean_tile = tok.mean(dim=1) # [T,1024]\n",
" max_tile = tok.max(dim=1).values # [T,1024]\n",
" gem_tile = gem_pool(tok, p=3.0) # [T,1024]\n",
"\n",
" # 3) Attention across-tiles (giữ multi-scale nhưng gọn)\n",
" tile_scores = attn_tile.norm(dim=-1) # [T]\n",
" w_tile = torch.softmax(tile_scores, dim=0).unsqueeze(-1) # [T,1]\n",
"\n",
" mean_vec = (mean_tile * w_tile).sum(dim=0)\n",
" max_vec = (max_tile * w_tile).sum(dim=0)\n",
" gem_vec = (gem_tile * w_tile).sum(dim=0)\n",
" attn_vec = (attn_tile * w_tile).sum(dim=0)\n",
"\n",
" # 4) Hợp nhất nhiều “góc nhìn” → 1 vector giàu thông tin\n",
" one_vec = torch.cat([mean_vec, max_vec, gem_vec, attn_vec], dim=0) # [4*1024]\n",
" one_vec = F.normalize(one_vec, dim=-1).unsqueeze(0) # [1, 4096]\n",
" return one_vec.half() # FP16 để tiết kiệm bộ nhớ"
]
},
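{
"cell_type": "markdown",
"id": "b2c3d4e5",
"metadata": {},
"source": [
"A quick self-contained sanity check of the pooling math above, added for illustration. It runs the same token-attention, tile-attention and fusion steps on random features (no model or images needed), so the shapes can be verified independently of the vision backbone."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b2c3d4e6",
"metadata": {},
"outputs": [],
"source": [
"# Sanity check on random features: T tiles, N tokens per tile, D dims.\n",
"T, N, D = 3, 16, 8\n",
"tok = torch.randn(T, N, D)\n",
"\n",
"w_tok = torch.softmax(tok.norm(dim=-1), dim=1).unsqueeze(-1)         # [T, N, 1]\n",
"attn_tile = (tok * w_tok).sum(dim=1)                                 # [T, D]\n",
"w_tile = torch.softmax(attn_tile.norm(dim=-1), dim=0).unsqueeze(-1)  # [T, 1]\n",
"\n",
"fused = torch.cat([\n",
"    (tok.mean(dim=1) * w_tile).sum(dim=0),\n",
"    (tok.max(dim=1).values * w_tile).sum(dim=0),\n",
"    (gem_pool(tok) * w_tile).sum(dim=0),\n",
"    (attn_tile * w_tile).sum(dim=0),\n",
"], dim=0)\n",
"print(fused.shape)  # torch.Size([32]) == 4 * D"
]
},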
{
"cell_type": "markdown",
"id": "ed35a4ce",
"metadata": {},
"source": [
"# pool"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "3edf8b67",
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"import torch.nn.functional as F\n",
"\n",
"# --- Pooling theo token (trong 1 tile) ---\n",
"def _pool_tokens(tokens: torch.Tensor, how: str = \"mean\") -> torch.Tensor:\n",
" \"\"\"\n",
" tokens: [1, N, D] hoặc [N, D]\n",
" return: [D]\n",
" \"\"\"\n",
" if tokens.dim() == 3: # [1, N, D] -> [N, D]\n",
" tokens = tokens.squeeze(0)\n",
"\n",
" if how == \"mean\":\n",
" v = tokens.mean(dim=0)\n",
" elif how == \"max\":\n",
" v = tokens.max(dim=0).values\n",
" elif how == \"gem\":\n",
" p = 3.0\n",
" v = (tokens.clamp(min=1e-6).pow(p).mean(dim=0)).pow(1.0/p)\n",
" elif how == \"cls\":\n",
" # chỉ dùng nếu backbone có CLS token ở vị trí đầu\n",
" v = tokens[0]\n",
" else:\n",
" raise ValueError(f\"Unknown pooling: {how}\")\n",
"\n",
" return v\n",
"\n",
"\n",
"@torch.no_grad()\n",
"def image_embedding_global(model, pixel_values: torch.Tensor,\n",
" pool: str = \"mean\",\n",
" normalize: bool = False,\n",
" global_index: int = 0,\n",
" use_projector: bool = False) -> torch.Tensor:\n",
" \"\"\"\n",
" Trả về 1 vector [1, D] mô tả toàn ảnh, chỉ dùng GLOBAL tile.\n",
" - pixel_values: [T,3,H,W] (ví dụ T=7) hoặc [1,3,H,W]\n",
" - global_index: thường = 0 (tile toàn ảnh nằm đầu)\n",
" - use_projector: CHỈ bật nếu bạn chắc chắn chiều khớp với projector (mlp1)\n",
" \"\"\"\n",
" model.eval()\n",
" device = next(model.parameters()).device\n",
" x = pixel_values.to(device)\n",
"\n",
" out = model.vision_model(pixel_values=x) # last_hidden_state: [T, N, D] hoặc [1, N, D]\n",
" tok = out.last_hidden_state\n",
"\n",
" # chọn global tile\n",
" if tok.size(0) > 1:\n",
" tok = tok[global_index:global_index+1] # [1, N, D]\n",
"\n",
" # (tuỳ chọn) projector sang không gian khác - cẩn thận mismatch chiều!\n",
" if use_projector:\n",
" # CHỈ nên bật khi biết chắc input dim của mlp1 khớp với tok.size(-1)\n",
" in_feat = getattr(model.mlp1[1], \"in_features\", None)\n",
" if in_feat is not None and tok.size(-1) == in_feat:\n",
" tok = model.mlp1(tok) # [1, N, D]\n",
" else:\n",
" raise ValueError(f\"Projector input dim mismatch: tokens={tok.size(-1)} vs mlp1.in={in_feat}\")\n",
"\n",
" v = _pool_tokens(tok, how=pool) # [D]\n",
" if normalize:\n",
" v = F.normalize(v, dim=-1)\n",
" return v.unsqueeze(0) # [1, D]\n",
"\n",
"\n",
"@torch.no_grad()\n",
"def image_embedding_mean(model, pixel_values: torch.Tensor,\n",
" pool: str = \"mean\",\n",
" normalize: bool = True,\n",
" use_projector: bool = False) -> torch.Tensor:\n",
" \"\"\"\n",
" Trả về 1 vector [1, D] mô tả toàn ảnh, bằng cách:\n",
" (1) pool theo token trong từng tile → [T, D]\n",
" (2) lấy mean across-tiles → [D]\n",
" \"\"\"\n",
" model.eval()\n",
" device = next(model.parameters()).device\n",
" x = pixel_values.to(device)\n",
"\n",
" out = model.vision_model(pixel_values=x)\n",
" tok = out.last_hidden_state # [T, N, D] hoặc [1, N, D]\n",
"\n",
" if use_projector:\n",
" in_feat = getattr(model.mlp1[1], \"in_features\", None)\n",
" if in_feat is not None and tok.size(-1) == in_feat:\n",
" tok = model.mlp1(tok)\n",
" else:\n",
" raise ValueError(f\"Projector input dim mismatch: tokens={tok.size(-1)} vs mlp1.in={in_feat}\")\n",
"\n",
" # pool theo token trong từng tile\n",
" T = tok.size(0)\n",
" per_tile = [ _pool_tokens(tok[t:t+1], how=pool) for t in range(T) ] # list of [D]\n",
" per_tile = torch.stack(per_tile, dim=0) # [T, D]\n",
"\n",
" # mean across-tiles\n",
" v = per_tile.mean(dim=0) # [D]\n",
" if normalize:\n",
" v = F.normalize(v, dim=-1)\n",
" return v.unsqueeze(0) # [1, D]\n"
]
},
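{
"cell_type": "markdown",
"id": "c3d4e5f6",
"metadata": {},
"source": [
"Another small illustration (not part of the original run): `_pool_tokens` can be exercised on random tokens to confirm that every pooling mode returns a single `[D]` vector. Note that `image_embedding_global` and `image_embedding_mean` assume an InternVL-style model exposing `vision_model` (and `mlp1`), not the Qwen2.5-VL model loaded at the top of this notebook."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c3d4e5f7",
"metadata": {},
"outputs": [],
"source": [
"# Shape check for each pooling mode on random data (no model needed).\n",
"tokens = torch.randn(1, 10, 6)  # [1, N, D]\n",
"for how in (\"mean\", \"max\", \"gem\", \"cls\"):\n",
"    print(how, tuple(_pool_tokens(tokens, how=how).shape))  # each -> (6,)"
]
},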
{
"cell_type": "markdown",
"id": "613cf001",
"metadata": {},
"source": [
"# infer"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "cdfdab0e",
"metadata": {},
"outputs": [],
"source": [
"def get_image_embedding(path):\n",
" \"\"\"\n",
" Processes a batch of images and extracts their embeddings.\n",
" \"\"\"\n",
" images_pil = []\n",
" valid_paths = []\n",
" if path.lower().endswith(('.png', '.jpg', '.jpeg')):\n",
" try:\n",
" # The processor expects PIL images in RGB format\n",
" images_pil.append(Image.open(path).convert(\"RGB\"))\n",
" # print(path)\n",
" valid_paths.append(path)\n",
" except Exception as e:\n",
" print(f\"Warning: Could not load image {path}. Skipping. Error: {e}\")\n",
"\n",
" if not valid_paths:\n",
" return np.array([]), []\n",
"\n",
" inputs = processor(\n",
" text=[\"\"] * len(images_pil),\n",
" images=images_pil,\n",
" padding=True,\n",
" return_tensors=\"pt\"\n",
" ).to(device)\n",
" \n",
" # embeddings = image_embedding(inputs, model, use_tiling=True)\n",
" embeddings = image_embedding_mean(model, inputs)\n",
" \n",
" return embeddings.to(torch.float16).cpu().numpy()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "cdaebb7b",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" 0%| | 0/2800 [00:03<?, ?it/s]\n"
]
},
{
"ename": "AttributeError",
"evalue": "'Qwen2_5_VLForConditionalGeneration' object has no attribute 'vision_model'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[15], line 15\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m tqdm(\u001b[38;5;28mrange\u001b[39m(\u001b[38;5;241m0\u001b[39m, \u001b[38;5;28mlen\u001b[39m(image_files), BATCH_SIZE)):\n\u001b[1;32m 14\u001b[0m batch_paths \u001b[38;5;241m=\u001b[39m image_files[i]\n\u001b[0;32m---> 15\u001b[0m batch_embeddings \u001b[38;5;241m=\u001b[39m \u001b[43mget_image_embedding\u001b[49m\u001b[43m(\u001b[49m\u001b[43mbatch_paths\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 16\u001b[0m embeddings_list \u001b[38;5;241m=\u001b[39m [emb\u001b[38;5;241m.\u001b[39mtolist() \u001b[38;5;28;01mfor\u001b[39;00m emb \u001b[38;5;129;01min\u001b[39;00m batch_embeddings]\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m path, emb \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mzip\u001b[39m(batch_paths, embeddings_list):\n",
"Cell \u001b[0;32mIn[14], line 27\u001b[0m, in \u001b[0;36mget_image_embedding\u001b[0;34m(path)\u001b[0m\n\u001b[1;32m 19\u001b[0m inputs \u001b[38;5;241m=\u001b[39m processor(\n\u001b[1;32m 20\u001b[0m text\u001b[38;5;241m=\u001b[39m[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m*\u001b[39m \u001b[38;5;28mlen\u001b[39m(images_pil),\n\u001b[1;32m 21\u001b[0m images\u001b[38;5;241m=\u001b[39mimages_pil,\n\u001b[1;32m 22\u001b[0m padding\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m,\n\u001b[1;32m 23\u001b[0m return_tensors\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpt\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 24\u001b[0m )\u001b[38;5;241m.\u001b[39mto(device)\n\u001b[1;32m 26\u001b[0m \u001b[38;5;66;03m# embeddings = image_embedding(inputs, model, use_tiling=True)\u001b[39;00m\n\u001b[0;32m---> 27\u001b[0m embeddings \u001b[38;5;241m=\u001b[39m \u001b[43mimage_embedding_mean\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 29\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m embeddings\u001b[38;5;241m.\u001b[39mto(torch\u001b[38;5;241m.\u001b[39mfloat16)\u001b[38;5;241m.\u001b[39mcpu()\u001b[38;5;241m.\u001b[39mnumpy()\n",
"File \u001b[0;32m~/sonnh/venv/lib/python3.10/site-packages/torch/utils/_contextlib.py:116\u001b[0m, in \u001b[0;36mcontext_decorator.<locals>.decorate_context\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 113\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(func)\n\u001b[1;32m 114\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mdecorate_context\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 115\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m ctx_factory():\n\u001b[0;32m--> 116\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
"Cell \u001b[0;32mIn[13], line 81\u001b[0m, in \u001b[0;36mimage_embedding_mean\u001b[0;34m(model, pixel_values, pool, normalize, use_projector)\u001b[0m\n\u001b[1;32m 78\u001b[0m device \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mnext\u001b[39m(model\u001b[38;5;241m.\u001b[39mparameters())\u001b[38;5;241m.\u001b[39mdevice\n\u001b[1;32m 79\u001b[0m x \u001b[38;5;241m=\u001b[39m pixel_values\u001b[38;5;241m.\u001b[39mto(device)\n\u001b[0;32m---> 81\u001b[0m out \u001b[38;5;241m=\u001b[39m \u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvision_model\u001b[49m(pixel_values\u001b[38;5;241m=\u001b[39mx)\n\u001b[1;32m 82\u001b[0m tok \u001b[38;5;241m=\u001b[39m out\u001b[38;5;241m.\u001b[39mlast_hidden_state \u001b[38;5;66;03m# [T, N, D] hoặc [1, N, D]\u001b[39;00m\n\u001b[1;32m 84\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m use_projector:\n",
"File \u001b[0;32m~/sonnh/venv/lib/python3.10/site-packages/torch/nn/modules/module.py:1940\u001b[0m, in \u001b[0;36mModule.__getattr__\u001b[0;34m(self, name)\u001b[0m\n\u001b[1;32m 1938\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m name \u001b[38;5;129;01min\u001b[39;00m modules:\n\u001b[1;32m 1939\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m modules[name]\n\u001b[0;32m-> 1940\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mAttributeError\u001b[39;00m(\n\u001b[1;32m 1941\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mtype\u001b[39m(\u001b[38;5;28mself\u001b[39m)\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m object has no attribute \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mname\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1942\u001b[0m )\n",
"\u001b[0;31mAttributeError\u001b[0m: 'Qwen2_5_VLForConditionalGeneration' object has no attribute 'vision_model'"
]
}
],
"source": [
"import json\n",
"\n",
"# --- Process all images in the directory ---\n",
"image_files = [os.path.join(IMAGE_DIR, f) for f in os.listdir(IMAGE_DIR) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]\n",
"all_embeddings = []\n",
"filepaths = []\n",
"BATCH_SIZE = 1\n",
"\n",
"with open(\"embeddings_factures_osteopathie_1k_InternVL3_5-4B-Instruct.json\", \"w\") as f:\n",
"\n",
" f.write(\"[\\n\")\n",
" first = True\n",
" for i in tqdm(range(0, len(image_files), BATCH_SIZE)):\n",
" batch_paths = image_files[i]\n",
" batch_embeddings = get_image_embedding(batch_paths)\n",
" embeddings_list = [emb.tolist() for emb in batch_embeddings]\n",
" for path, emb in zip(batch_paths, embeddings_list):\n",
" if not first:\n",
" f.write(\",\\n\")\n",
" json.dump({\"filepath\": path, \"embedding\": emb}, f)\n",
" first = False\n",
" f.write(\"\\n]\\n\")\n",
"\n",
"print(\"Embeddings extracted and saved.\")"
]
},
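{
"cell_type": "markdown",
"id": "d4e5f6a7",
"metadata": {},
"source": [
"The `AttributeError` above is the InternVL/Qwen mismatch: the pooling helpers call `model.vision_model`, but the Qwen2.5-VL model loaded at the top exposes its vision tower as `model.visual`. Below is a hedged sketch (not the original notebook's code) of one way to pool an image embedding directly from Qwen2.5-VL by feeding the processor's `pixel_values` and `image_grid_thw` to `model.visual`; the exact call signature of the vision tower may differ between transformers versions, so verify it against the installed release before relying on this."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d4e5f6a8",
"metadata": {},
"outputs": [],
"source": [
"# Sketch only: mean-pooled image embedding from Qwen2.5-VL's vision tower.\n",
"# Assumptions: `model`, `processor`, `device` are the objects defined above, and\n",
"# model.visual accepts (pixel_values, grid_thw) as in recent transformers releases.\n",
"import torch.nn.functional as F\n",
"\n",
"@torch.no_grad()\n",
"def qwen_image_embedding(path, normalize=True):\n",
"    img = Image.open(path).convert(\"RGB\")\n",
"    inputs = processor(text=[\"\"], images=[img], padding=True, return_tensors=\"pt\").to(device)\n",
"    # Flattened patch sequence in, merged vision tokens out: [num_tokens, hidden_dim]\n",
"    feats = model.visual(\n",
"        inputs[\"pixel_values\"].to(model.dtype),\n",
"        grid_thw=inputs[\"image_grid_thw\"],\n",
"    )\n",
"    vec = feats.mean(dim=0)                 # mean-pool over vision tokens -> [hidden_dim]\n",
"    if normalize:\n",
"        vec = F.normalize(vec, dim=-1)      # L2-normalize for retrieval/clustering\n",
"    return vec.unsqueeze(0).to(torch.float16).cpu().numpy()  # [1, hidden_dim]\n",
"\n",
"# Example (hypothetical; assumes image_files from the cell above is non-empty):\n",
"# emb = qwen_image_embedding(image_files[0])\n",
"# print(emb.shape)"
]
},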
{
"cell_type": "code",
"execution_count": 9,
"id": "2c3e6dd0",
"metadata": {},
"outputs": [],
"source": [
"image_path = \"/home/nguyendc/model-factory/Finetuning-Automation/etc/data/media/docai_mgp_facture_v2_0/c363e486-5d45-425e-aef9-4791cad120f7_20250213_120759_1_scale_1.0.jpg\""
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "29620d93",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"User: <image>\n",
"Please describe the image shortly.\n",
"Assistant: The image shows a receipt for a consultation with Noël Breignaud, an osteopath. It includes his contact information, with the address \"104, cours des fossés, 33210 Langon\" and his phone number. The receipt details a payment of 55€ for a consultation dated 15/06/2020. The receipt number is 1750401922774. There are handwritten details and signatures, with the amount and date written in ink. Noël Breignaud's signature and a circular stamp are also present.\n",
"User: <image>\n",
"Please describe the image in detail.\n",
"Assistant: The image is a handwritten receipt or invoice from a practitioner named Noël Breign aud, who is an osteopath. The text on the left side of the document includes the following details:\n",
"\n",
"- **Name:** Noël Breignaud\n",
"- **Profession:** Ouestopathe (Osteopath)\n",
"- **Address:** 104, cours des fossés, 33210 Lagnon\n",
"- **Phone Number:** Tel. 06 88 70 66 43\n",
"\n",
"On the right side, there are registration and identification numbers:\n",
"\n",
"- **Nº SIRET:** 510 123 631 00010\n",
"- **Nº ADELI:** 330001108\n",
"- **Code APE:** 8690E\n",
"\n",
"The handwritten section of the document is in French and reads:\n",
"\n",
"- \"Déclaire avoir reçu de M. M. (fils) G[obon], Acquitté la somme de 55 €\n",
"Pour 1 consultation en date du 05/04/2024\n",
"N°: 1750460-19212774\"\n",
"\n",
"At the bottom right, there is a signature that appears to be of Noël Breignaud, with a red stamp partially visible, which seems to contain the text \"Noël BREIGNAUD\" and other markings.\n",
"\n",
"The date in the handwritten section is \"05/04/2024,\" indicating the receipt or service provided on that date. The amount mentioned is 55 euros for one consultation.\n"
]
}
],
"source": [
"pixel_values = load_image(image_path, max_num=12).to(torch.bfloat16).cuda()\n",
"generation_config = dict(max_new_tokens=1024, do_sample=True)\n",
"\n",
"\n",
"\n",
"question = '<image>\\nPlease describe the image shortly.'\n",
"response = model.chat(tokenizer, pixel_values, question, generation_config)\n",
"print(f'User: {question}\\nAssistant: {response}')\n",
"\n",
"# single-image multi-round conversation (单图多轮对话)\n",
"question = '<image>\\nPlease describe the image in detail.'\n",
"response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)\n",
"print(f'User: {question}\\nAssistant: {response}')"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "35dc90e0",
"metadata": {},
"outputs": [],
"source": [
"vout = model.vision_model(pixel_values=pixel_values, output_hidden_states=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "77f3720a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"torch.Size([7, 1025, 1024])\n"
]
}
],
"source": [
"patch_feats = vout.last_hidden_state # [B, N_patches, Dv], Dv ~ 1024 theo kiến trúc của bạn\n",
"print(patch_feats.shape)\n",
"# Nếu backbone có CLS token, bạn có thể dùng patch_feats[:,0]\n",
"# Cách an toàn chung: mean-pool\n",
"# img_vec = patch_feats.mean(dim=1) # [B, Dv]\n",
"# img_vec = torch.nn.functional.normalize(img_vec, dim=-1) # L2 normalize cho retrieval"
]
},
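{
"cell_type": "markdown",
"id": "e5f6a7b8",
"metadata": {},
"source": [
"For completeness, the mean-pool and L2-normalization sketched in the comments above can be run directly on `patch_feats` (added for illustration; it assumes `vout` from the InternVL `vision_model` call above is in scope):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e5f6a7b9",
"metadata": {},
"outputs": [],
"source": [
"# One mean-pooled, L2-normalized vector per tile.\n",
"img_vec = patch_feats.mean(dim=1)                          # [T, Dv]\n",
"img_vec = torch.nn.functional.normalize(img_vec, dim=-1)   # unit-norm rows for retrieval\n",
"print(img_vec.shape)  # e.g. torch.Size([7, 1024]) for 7 tiles"
]
},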
{
"cell_type": "code",
"execution_count": null,
"id": "0043634c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"torch.Size([7, 1024])"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# img_vec.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "92032162",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 14,
"id": "ede95852",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"InternVLChatModel(\n",
" (vision_model): InternVisionModel(\n",
" (embeddings): InternVisionEmbeddings(\n",
" (patch_embedding): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))\n",
" )\n",
" (encoder): InternVisionEncoder(\n",
" (layers): ModuleList(\n",
" (0): InternVisionEncoderLayer(\n",
" (attn): InternAttention(\n",
" (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
" (attn_drop): Dropout(p=0.0, inplace=False)\n",
" (proj_drop): Dropout(p=0.0, inplace=False)\n",
" (inner_attn): FlashAttention()\n",
" (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
" )\n",
" (mlp): InternMLP(\n",
" (act): GELUActivation()\n",
" (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
" (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
" )\n",
" (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
" (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
" (drop_path1): Identity()\n",
" (drop_path2): Identity()\n",
" )\n",
" (1): InternVisionEncoderLayer(\n",
" (attn): InternAttention(\n",
" (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
" (attn_drop): Dropout(p=0.0, inplace=False)\n",
" (proj_drop): Dropout(p=0.0, inplace=False)\n",
" (inner_attn): FlashAttention()\n",
" (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
" )\n",
" (mlp): InternMLP(\n",
" (act): GELUActivation()\n",
" (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
" (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
" )\n",
" (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
" (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
" (drop_path1): DropPath(drop_prob=0.004)\n",
" (drop_path2): DropPath(drop_prob=0.004)\n",
" )\n",
" (2): InternVisionEncoderLayer(\n",
" (attn): InternAttention(\n",
" (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
" (attn_drop): Dropout(p=0.0, inplace=False)\n",
" (proj_drop): Dropout(p=0.0, inplace=False)\n",
" (inner_attn): FlashAttention()\n",
" (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
" )\n",
" (mlp): InternMLP(\n",
" (act): GELUActivation()\n",
" (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
" (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
" )\n",
" (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
" (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
" (drop_path1): DropPath(drop_prob=0.009)\n",
" (drop_path2): DropPath(drop_prob=0.009)\n",
" )\n",
" (3): InternVisionEncoderLayer(\n",
" (attn): InternAttention(\n",
" (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
" (attn_drop): Dropout(p=0.0, inplace=False)\n",
" (proj_drop): Dropout(p=0.0, inplace=False)\n",
" (inner_attn): FlashAttention()\n",
" (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
" )\n",
" (mlp): InternMLP(\n",
" (act): GELUActivation()\n",
" (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
" (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
" )\n",
" (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
" (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
" (drop_path1): DropPath(drop_prob=0.013)\n",
" (drop_path2): DropPath(drop_prob=0.013)\n",
" )\n",
" (4): InternVisionEncoderLayer(\n",
" (attn): InternAttention(\n",
" (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
" (attn_drop): Dropout(p=0.0, inplace=False)\n",
" (proj_drop): Dropout(p=0.0, inplace=False)\n",
" (inner_attn): FlashAttention()\n",
" (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
" )\n",
" (mlp): InternMLP(\n",
" (act): GELUActivation()\n",
" (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
" (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
" )\n",
" (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
" (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
" (drop_path1): DropPath(drop_prob=0.017)\n",
" (drop_path2): DropPath(drop_prob=0.017)\n",
" )\n",
" (5): InternVisionEncoderLayer(\n",
" (attn): InternAttention(\n",
" (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
" (attn_drop): Dropout(p=0.0, inplace=False)\n",
" (proj_drop): Dropout(p=0.0, inplace=False)\n",
" (inner_attn): FlashAttention()\n",
" (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
" )\n",
" (mlp): InternMLP(\n",
" (act): GELUActivation()\n",
" (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
" (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
" )\n",
" (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
" (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
" (drop_path1): DropPath(drop_prob=0.022)\n",
" (drop_path2): DropPath(drop_prob=0.022)\n",
" )\n",
" (6): InternVisionEncoderLayer(\n",
" (attn): InternAttention(\n",
" (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
" (attn_drop): Dropout(p=0.0, inplace=False)\n",
" (proj_drop): Dropout(p=0.0, inplace=False)\n",
" (inner_attn): FlashAttention()\n",
" (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
" )\n",
" (mlp): InternMLP(\n",
" (act): GELUActivation()\n",
" (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
" (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
" )\n",
" (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
" (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
" (drop_path1): DropPath(drop_prob=0.026)\n",
" (drop_path2): DropPath(drop_prob=0.026)\n",
" )\n",
" (7): InternVisionEncoderLayer(\n",
" (attn): InternAttention(\n",
" (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
" (attn_drop): Dropout(p=0.0, inplace=False)\n",
" (proj_drop): Dropout(p=0.0, inplace=False)\n",
" (inner_attn): FlashAttention()\n",
" (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
" )\n",
" (mlp): InternMLP(\n",
" (act): GELUActivation()\n",
" (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
" (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
" )\n",
" (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
" (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
" (drop_path1): DropPath(drop_prob=0.031)\n",
" (drop_path2): DropPath(drop_prob=0.031)\n",
" )\n",
" (8): InternVisionEncoderLayer(\n",
" (attn): InternAttention(\n",
" (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
" (attn_drop): Dropout(p=0.0, inplace=False)\n",
" (proj_drop): Dropout(p=0.0, inplace=False)\n",
" (inner_attn): FlashAttention()\n",
" (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
" )\n",
" (mlp): InternMLP(\n",
" (act): GELUActivation()\n",
" (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
" (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
" )\n",
" (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
" (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
" (drop_path1): DropPath(drop_prob=0.035)\n",
" (drop_path2): DropPath(drop_prob=0.035)\n",
" )\n",
" (9): InternVisionEncoderLayer(\n",
" (attn): InternAttention(\n",
" (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
" (attn_drop): Dropout(p=0.0, inplace=False)\n",
" (proj_drop): Dropout(p=0.0, inplace=False)\n",
" (inner_attn): FlashAttention()\n",
" (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
" )\n",
" (mlp): InternMLP(\n",
" (act): GELUActivation()\n",
" (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
" (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
" )\n",
" (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
" (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
" (drop_path1): DropPath(drop_prob=0.039)\n",
" (drop_path2): DropPath(drop_prob=0.039)\n",
" )\n",
" (10): InternVisionEncoderLayer(\n",
" (attn): InternAttention(\n",
" (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
" (attn_drop): Dropout(p=0.0, inplace=False)\n",
" (proj_drop): Dropout(p=0.0, inplace=False)\n",
" (inner_attn): FlashAttention()\n",
" (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
" )\n",
" (mlp): InternMLP(\n",
" (act): GELUActivation()\n",
" (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
" (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
" )\n",
" (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
" (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
" (drop_path1): DropPath(drop_prob=0.044)\n",
" (drop_path2): DropPath(drop_prob=0.044)\n",
" )\n",
" (11): InternVisionEncoderLayer(\n",
" (attn): InternAttention(\n",
" (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
" (attn_drop): Dropout(p=0.0, inplace=False)\n",
" (proj_drop): Dropout(p=0.0, inplace=False)\n",
" (inner_attn): FlashAttention()\n",
" (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
" )\n",
" (mlp): InternMLP(\n",
" (act): GELUActivation()\n",
" (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
" (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
" )\n",
" (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
" (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
" (drop_path1): DropPath(drop_prob=0.048)\n",
" (drop_path2): DropPath(drop_prob=0.048)\n",
" )\n",
" (12): InternVisionEncoderLayer(\n",
" (attn): InternAttention(\n",
" (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
" (attn_drop): Dropout(p=0.0, inplace=False)\n",
" (proj_drop): Dropout(p=0.0, inplace=False)\n",
" (inner_attn): FlashAttention()\n",
" (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
" )\n",
" (mlp): InternMLP(\n",
" (act): GELUActivation()\n",
" (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
" (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
" )\n",
" (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
" (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
" (drop_path1): DropPath(drop_prob=0.052)\n",
" (drop_path2): DropPath(drop_prob=0.052)\n",
" )\n",
" (13): InternVisionEncoderLayer(\n",
" (attn): InternAttention(\n",
" (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
" (attn_drop): Dropout(p=0.0, inplace=False)\n",
" (proj_drop): Dropout(p=0.0, inplace=False)\n",
" (inner_attn): FlashAttention()\n",
" (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
" )\n",
" (mlp): InternMLP(\n",
" (act): GELUActivation()\n",
" (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
" (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
" )\n",
" (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
" (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
" (drop_path1): DropPath(drop_prob=0.056)\n",
" (drop_path2): DropPath(drop_prob=0.056)\n",
" )\n",
" (14): InternVisionEncoderLayer(\n",
" (attn): InternAttention(\n",
" (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
" (attn_drop): Dropout(p=0.0, inplace=False)\n",
" (proj_drop): Dropout(p=0.0, inplace=False)\n",
" (inner_attn): FlashAttention()\n",
" (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
" )\n",
" (mlp): InternMLP(\n",
" (act): GELUActivation()\n",
" (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
" (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
" )\n",
" (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
" (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
" (drop_path1): DropPath(drop_prob=0.061)\n",
" (drop_path2): DropPath(drop_prob=0.061)\n",
" )\n",
" (15): InternVisionEncoderLayer(\n",
" (attn): InternAttention(\n",
" (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
" (attn_drop): Dropout(p=0.0, inplace=False)\n",
" (proj_drop): Dropout(p=0.0, inplace=False)\n",
" (inner_attn): FlashAttention()\n",
" (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
" )\n",
" (mlp): InternMLP(\n",
" (act): GELUActivation()\n",
" (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
" (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
" )\n",
" (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
" (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
" (drop_path1): DropPath(drop_prob=0.065)\n",
" (drop_path2): DropPath(drop_prob=0.065)\n",
" )\n",
" (16): InternVisionEncoderLayer(\n",
" (attn): InternAttention(\n",
" (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
" (attn_drop): Dropout(p=0.0, inplace=False)\n",
" (proj_drop): Dropout(p=0.0, inplace=False)\n",
" (inner_attn): FlashAttention()\n",
" (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
" )\n",
" (mlp): InternMLP(\n",
" (act): GELUActivation()\n",
" (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
" (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
" )\n",
" (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
" (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
" (drop_path1): DropPath(drop_prob=0.069)\n",
" (drop_path2): DropPath(drop_prob=0.069)\n",
" )\n",
" (17): InternVisionEncoderLayer(\n",
" (attn): InternAttention(\n",
" (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
" (attn_drop): Dropout(p=0.0, inplace=False)\n",
" (proj_drop): Dropout(p=0.0, inplace=False)\n",
" (inner_attn): FlashAttention()\n",
" (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
" )\n",
" (mlp): InternMLP(\n",
" (act): GELUActivation()\n",
" (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
" (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
" )\n",
" (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
" (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
" (drop_path1): DropPath(drop_prob=0.074)\n",
" (drop_path2): DropPath(drop_prob=0.074)\n",
" )\n",
" (18): InternVisionEncoderLayer(\n",
" (attn): InternAttention(\n",
" (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
" (attn_drop): Dropout(p=0.0, inplace=False)\n",
" (proj_drop): Dropout(p=0.0, inplace=False)\n",
" (inner_attn): FlashAttention()\n",
" (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
" )\n",
" (mlp): InternMLP(\n",
" (act): GELUActivation()\n",
" (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
" (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
" )\n",
" (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
" (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
" (drop_path1): DropPath(drop_prob=0.078)\n",
" (drop_path2): DropPath(drop_prob=0.078)\n",
" )\n",
" (19): InternVisionEncoderLayer(\n",
" (attn): InternAttention(\n",
" (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
" (attn_drop): Dropout(p=0.0, inplace=False)\n",
" (proj_drop): Dropout(p=0.0, inplace=False)\n",
" (inner_attn): FlashAttention()\n",
" (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
" )\n",
" (mlp): InternMLP(\n",
" (act): GELUActivation()\n",
" (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
" (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
" )\n",
" (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
" (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
" (drop_path1): DropPath(drop_prob=0.083)\n",
" (drop_path2): DropPath(drop_prob=0.083)\n",
" )\n",
" (20): InternVisionEncoderLayer(\n",
" (attn): InternAttention(\n",
" (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
" (attn_drop): Dropout(p=0.0, inplace=False)\n",
" (proj_drop): Dropout(p=0.0, inplace=False)\n",
" (inner_attn): FlashAttention()\n",
" (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
" )\n",
" (mlp): InternMLP(\n",
" (act): GELUActivation()\n",
" (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
" (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
" )\n",
" (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
" (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
" (drop_path1): DropPath(drop_prob=0.087)\n",
" (drop_path2): DropPath(drop_prob=0.087)\n",
" )\n",
" (21): InternVisionEncoderLayer(\n",
" (attn): InternAttention(\n",
" (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
" (attn_drop): Dropout(p=0.0, inplace=False)\n",
" (proj_drop): Dropout(p=0.0, inplace=False)\n",
" (inner_attn): FlashAttention()\n",
" (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
" )\n",
" (mlp): InternMLP(\n",
" (act): GELUActivation()\n",
" (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
" (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
" )\n",
" (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
" (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
" (drop_path1): DropPath(drop_prob=0.091)\n",
" (drop_path2): DropPath(drop_prob=0.091)\n",
" )\n",
" (22): InternVisionEncoderLayer(\n",
" (attn): InternAttention(\n",
" (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
" (attn_drop): Dropout(p=0.0, inplace=False)\n",
" (proj_drop): Dropout(p=0.0, inplace=False)\n",
" (inner_attn): FlashAttention()\n",
" (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
" )\n",
" (mlp): InternMLP(\n",
" (act): GELUActivation()\n",
" (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
" (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
" )\n",
" (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
" (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
" (drop_path1): DropPath(drop_prob=0.096)\n",
" (drop_path2): DropPath(drop_prob=0.096)\n",
" )\n",
" (23): InternVisionEncoderLayer(\n",
" (attn): InternAttention(\n",
" (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
" (attn_drop): Dropout(p=0.0, inplace=False)\n",
" (proj_drop): Dropout(p=0.0, inplace=False)\n",
" (inner_attn): FlashAttention()\n",
" (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
" )\n",
" (mlp): InternMLP(\n",
" (act): GELUActivation()\n",
" (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
" (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
" )\n",
" (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
" (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
" (drop_path1): DropPath(drop_prob=0.100)\n",
" (drop_path2): DropPath(drop_prob=0.100)\n",
" )\n",
" )\n",
" )\n",
" )\n",
" (language_model): Qwen3ForCausalLM(\n",
" (model): Qwen3Model(\n",
" (embed_tokens): Embedding(151936, 2560)\n",
" (layers): ModuleList(\n",
" (0-35): 36 x Qwen3DecoderLayer(\n",
" (self_attn): Qwen3Attention(\n",
" (q_proj): Linear(in_features=2560, out_features=4096, bias=False)\n",
" (k_proj): Linear(in_features=2560, out_features=1024, bias=False)\n",
" (v_proj): Linear(in_features=2560, out_features=1024, bias=False)\n",
" (o_proj): Linear(in_features=4096, out_features=2560, bias=False)\n",
" (q_norm): Qwen3RMSNorm((128,), eps=1e-06)\n",
" (k_norm): Qwen3RMSNorm((128,), eps=1e-06)\n",
" )\n",
" (mlp): Qwen3MLP(\n",
" (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)\n",
" (up_proj): Linear(in_features=2560, out_features=9728, bias=False)\n",
" (down_proj): Linear(in_features=9728, out_features=2560, bias=False)\n",
" (act_fn): SiLU()\n",
" )\n",
" (input_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)\n",
" (post_attention_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)\n",
" )\n",
" )\n",
" (norm): Qwen3RMSNorm((2560,), eps=1e-06)\n",
" (rotary_emb): Qwen3RotaryEmbedding()\n",
" )\n",
" (lm_head): Linear(in_features=2560, out_features=151936, bias=False)\n",
" )\n",
" (mlp1): Sequential(\n",
" (0): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
" (1): Linear(in_features=4096, out_features=2560, bias=True)\n",
" (2): GELU(approximate='none')\n",
" (3): Linear(in_features=2560, out_features=2560, bias=True)\n",
" )\n",
")"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "27fea4f3",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 100% |███████████████| 1091/1091 [174.1ms elapsed, 0s remaining, 6.3K samples/s] \n"
]
}
],
"source": [
"import fiftyone as fo\n",
"import fiftyone.brain as fob\n",
"import numpy as np\n",
"from sklearn.mixture import GaussianMixture\n",
"import json\n",
"\n",
"DATASET_NAME = \"mock\"\n",
"\n",
"json_path = \"./embeddings_factures_osteopathie_1k_qwen.json\"\n",
"\n",
"with open(json_path, \"r\") as file:\n",
" embedding_data = json.load(file)\n",
"\n",
"file_paths = []\n",
"embeddings = []\n",
"for i, record in enumerate(embedding_data):\n",
" file_paths.append(record.get(\"filepath\"))\n",
" embeddings.append(record.get(\"embedding\"))\n",
"\n",
"if DATASET_NAME in fo.list_datasets():\n",
" dataset = fo.load_dataset(DATASET_NAME)\n",
" dataset.delete()\n",
"dataset = fo.Dataset(DATASET_NAME)\n",
"\n",
"# Add samples to the dataset\n",
"samples = [fo.Sample(filepath=p) for p in file_paths]\n",
"dataset.add_samples(samples)\n",
"\n",
"# Building Gaussian mixture model (GMM)\n",
"n_gaussians = 50\n",
"gmm = GaussianMixture(n_components=n_gaussians, random_state=42)\n",
"gmm.fit(embeddings)\n",
"cluster_labels = gmm.predict(embeddings)\n",
"\n",
"# Adding labeled embeddings to visulization\n",
"dataset.add_sample_field(\"gmm_cluster\", fo.IntField)\n",
"for sample, label in zip(dataset, cluster_labels):\n",
" sample[\"gmm_cluster_50_gaussians\"] = int(label)\n",
" sample.save()\n",
"\n",
"n_gaussians = 200\n",
"gmm = GaussianMixture(n_components=n_gaussians, random_state=42)\n",
"gmm.fit(embeddings)\n",
"cluster_labels = gmm.predict(embeddings)\n",
"\n",
"# Adding labeled embeddings to visulization\n",
"dataset.add_sample_field(\"gmm_cluster\", fo.IntField)\n",
"for sample, label in zip(dataset, cluster_labels):\n",
" sample[\"gmm_cluster_200_gaussians\"] = int(label)\n",
" sample.save()\n",
"\n",
"# --- Visualize the Embeddings with UMAP ---\n",
"# This will compute a 2D representation of your embeddings\n",
"# for visualization.\n",
"res = fob.compute_visualization(\n",
" dataset,\n",
" embeddings=embeddings,\n",
" brain_key=\"qwen_vision_viz\",\n",
" method=\"tsne\",\n",
" verbose=True\n",
")\n",
"dataset.set_values(\"qwen_umap\", res.current_points)\n",
"\n",
"print(\"UMAP visualization computed. Launch the app to see the plot.\")\n",
"session = fo.launch_app(dataset)"
]
}
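,
{
"cell_type": "markdown",
"id": "f6a7b8c9",
"metadata": {},
"source": [
"A small follow-up sketch, added for illustration; it assumes `cluster_labels` and `n_gaussians` from the 200-component fit above are still in scope. Counting how many samples land in each GMM component is a quick way to spot empty or dominant clusters before inspecting them in the FiftyOne app."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f6a7b8d0",
"metadata": {},
"outputs": [],
"source": [
"# Per-cluster sample counts for the most recent GMM fit.\n",
"counts = np.bincount(cluster_labels, minlength=n_gaussians)\n",
"print(\"non-empty clusters:\", int((counts > 0).sum()), \"of\", n_gaussians)\n",
"print(\"largest cluster size:\", int(counts.max()))"
]
}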
],
"metadata": {
"kernelspec": {
"display_name": "venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}