In [10]:
import torch
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor, AutoModel
# from qwen_vl_utils import process_vision_info
from PIL import Image
import os
import numpy as np
from tqdm import tqdm
import math
import torch
from transformers import AutoTokenizer, AutoModel
import timm

# --- Configuration ---
MODEL_NAME = "Qwen/Qwen2.5-VL-3B-Instruct"  # You can choose other model sizes
IMAGE_DIR = "/home/nguyendc/model-factory/Finetuning-Automation/etc/data/media/docai_mgp_facture_v2_0/"
BATCH_SIZE = 4
# --- End Configuration ---

# Check for GPU availability
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load the model and processor

model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.bfloat16,
    # use_flash_attn=True,
    attn_implementation="flash_attention_2",
    trust_remote_code=True,
    device_map="cuda").eval()

processor = AutoProcessor.from_pretrained(
    MODEL_NAME, 
    trust_remote_code=True
    )
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME, 
    trust_remote_code=True, 
    use_fast=False
    )

Using device: cuda


Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.26it/s]


In [11]:
model

Qwen2_5_VLForConditionalGeneration(
  (visual): Qwen2_5_VisionTransformerPretrainedModel(
    (patch_embed): Qwen2_5_VisionPatchEmbed(
      (proj): Conv3d(3, 1280, kernel_size=(2, 14, 14), stride=(2, 14, 14), bias=False)
    )
    (rotary_pos_emb): Qwen2_5_VisionRotaryEmbedding()
    (blocks): ModuleList(
      (0-31): 32 x Qwen2_5_VLVisionBlock(
        (norm1): Qwen2RMSNorm((1280,), eps=1e-06)
        (norm2): Qwen2RMSNorm((1280,), eps=1e-06)
        (attn): Qwen2_5_VLVisionFlashAttention2(
          (qkv): Linear(in_features=1280, out_features=3840, bias=True)
          (proj): Linear(in_features=1280, out_features=1280, bias=True)
        )
        (mlp): Qwen2_5_VLMLP(
          (gate_proj): Linear(in_features=1280, out_features=3420, bias=True)
          (up_proj): Linear(in_features=1280, out_features=3420, bias=True)
          (down_proj): Linear(in_features=3420, out_features=1280, bias=True)
          (act_fn): SiLU()
        )
      )
    )
    (merger): Qwen2_5_VLPatchMerg

In [3]:
processor

Qwen2_5_VLProcessor:
- image_processor: Qwen2VLImageProcessor {
  "do_convert_rgb": true,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.48145466,
    0.4578275,
    0.40821073
  ],
  "image_processor_type": "Qwen2VLImageProcessor",
  "image_std": [
    0.26862954,
    0.26130258,
    0.27577711
  ],
  "max_pixels": 12845056,
  "merge_size": 2,
  "min_pixels": 3136,
  "patch_size": 14,
  "processor_class": "Qwen2_5_VLProcessor",
  "resample": 3,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "longest_edge": 12845056,
    "shortest_edge": 3136
  },
  "temporal_patch_size": 2
}

- tokenizer: Qwen2TokenizerFast(name_or_path='Qwen/Qwen2.5-VL-3B-Instruct', vocab_size=151643, model_max_length=131072, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<

In [16]:
model.visual

Qwen2_5_VisionTransformerPretrainedModel(
  (patch_embed): Qwen2_5_VisionPatchEmbed(
    (proj): Conv3d(3, 1280, kernel_size=(2, 14, 14), stride=(2, 14, 14), bias=False)
  )
  (rotary_pos_emb): Qwen2_5_VisionRotaryEmbedding()
  (blocks): ModuleList(
    (0-31): 32 x Qwen2_5_VLVisionBlock(
      (norm1): Qwen2RMSNorm((1280,), eps=1e-06)
      (norm2): Qwen2RMSNorm((1280,), eps=1e-06)
      (attn): Qwen2_5_VLVisionFlashAttention2(
        (qkv): Linear(in_features=1280, out_features=3840, bias=True)
        (proj): Linear(in_features=1280, out_features=1280, bias=True)
      )
      (mlp): Qwen2_5_VLMLP(
        (gate_proj): Linear(in_features=1280, out_features=3420, bias=True)
        (up_proj): Linear(in_features=1280, out_features=3420, bias=True)
        (down_proj): Linear(in_features=3420, out_features=1280, bias=True)
        (act_fn): SiLU()
      )
    )
  )
  (merger): Qwen2_5_VLPatchMerger(
    (ln_q): Qwen2RMSNorm((1280,), eps=1e-06)
    (mlp): Sequential(
      (0): Linear(

# preprocess

# Attention pooling


In [12]:
import torch
import torch.nn.functional as F

def gem_pool(x, p: float = 3.0, eps: float = 1e-6):
    # x: [B, N, D]
    return (x.clamp(min=eps).pow(p).mean(dim=1)).pow(1.0/p)

@torch.no_grad()
def image_embedding(pixel_values, model, use_tiling=True):
    # pixel_values: nếu dùng processor của InternVL, có thể là [T,3,H,W]; nếu bạn tự resize = [1,3,H,W]
    out = model.vision_model(pixel_values=pixel_values, output_hidden_states=True)
    tok = out.last_hidden_state            # [T, N, 1024] hoặc [1, N, 1024]
    if tok.dim() == 2:                     # phòng trường hợp model trả [N, D]
        tok = tok.unsqueeze(0)

    # 1) Attention pooling theo token, trong từng tile
    w_tok = torch.softmax(tok.norm(dim=-1), dim=1).unsqueeze(-1)   # [T,N,1]
    attn_tile = (tok * w_tok).sum(dim=1)                           # [T,1024]

    # 2) Các pooling khác theo token
    mean_tile = tok.mean(dim=1)                                    # [T,1024]
    max_tile  = tok.max(dim=1).values                              # [T,1024]
    gem_tile  = gem_pool(tok, p=3.0)                               # [T,1024]

    # 3) Attention across-tiles (giữ multi-scale nhưng gọn)
    tile_scores = attn_tile.norm(dim=-1)                           # [T]
    w_tile = torch.softmax(tile_scores, dim=0).unsqueeze(-1)       # [T,1]

    mean_vec = (mean_tile * w_tile).sum(dim=0)
    max_vec  = (max_tile  * w_tile).sum(dim=0)
    gem_vec  = (gem_tile  * w_tile).sum(dim=0)
    attn_vec = (attn_tile * w_tile).sum(dim=0)

    # 4) Hợp nhất nhiều “góc nhìn” → 1 vector giàu thông tin
    one_vec = torch.cat([mean_vec, max_vec, gem_vec, attn_vec], dim=0)  # [4*1024]
    one_vec = F.normalize(one_vec, dim=-1).unsqueeze(0)                  # [1, 4096]
    return one_vec.half()  # FP16 để tiết kiệm bộ nhớ

# pool

In [13]:
import torch
import torch.nn.functional as F

# --- Pooling theo token (trong 1 tile) ---
def _pool_tokens(tokens: torch.Tensor, how: str = "mean") -> torch.Tensor:
    """
    tokens: [1, N, D] hoặc [N, D]
    return: [D]
    """
    if tokens.dim() == 3:   # [1, N, D] -> [N, D]
        tokens = tokens.squeeze(0)

    if how == "mean":
        v = tokens.mean(dim=0)
    elif how == "max":
        v = tokens.max(dim=0).values
    elif how == "gem":
        p = 3.0
        v = (tokens.clamp(min=1e-6).pow(p).mean(dim=0)).pow(1.0/p)
    elif how == "cls":
        # chỉ dùng nếu backbone có CLS token ở vị trí đầu
        v = tokens[0]
    else:
        raise ValueError(f"Unknown pooling: {how}")

    return v


@torch.no_grad()
def image_embedding_global(model, pixel_values: torch.Tensor,
                           pool: str = "mean",
                           normalize: bool = False,
                           global_index: int = 0,
                           use_projector: bool = False) -> torch.Tensor:
    """
    Trả về 1 vector [1, D] mô tả toàn ảnh, chỉ dùng GLOBAL tile.
    - pixel_values: [T,3,H,W] (ví dụ T=7) hoặc [1,3,H,W]
    - global_index: thường = 0 (tile toàn ảnh nằm đầu)
    - use_projector: CHỈ bật nếu bạn chắc chắn chiều khớp với projector (mlp1)
    """
    model.eval()
    device = next(model.parameters()).device
    x = pixel_values.to(device)

    out = model.vision_model(pixel_values=x)     # last_hidden_state: [T, N, D] hoặc [1, N, D]
    tok = out.last_hidden_state

    # chọn global tile
    if tok.size(0) > 1:
        tok = tok[global_index:global_index+1]   # [1, N, D]

    # (tuỳ chọn) projector sang không gian khác - cẩn thận mismatch chiều!
    if use_projector:
        # CHỈ nên bật khi biết chắc input dim của mlp1 khớp với tok.size(-1)
        in_feat = getattr(model.mlp1[1], "in_features", None)
        if in_feat is not None and tok.size(-1) == in_feat:
            tok = model.mlp1(tok)                # [1, N, D’]
        else:
            raise ValueError(f"Projector input dim mismatch: tokens={tok.size(-1)} vs mlp1.in={in_feat}")

    v = _pool_tokens(tok, how=pool)              # [D]
    if normalize:
        v = F.normalize(v, dim=-1)
    return v.unsqueeze(0)                        # [1, D]


@torch.no_grad()
def image_embedding_mean(model, pixel_values: torch.Tensor,
                         pool: str = "mean",
                         normalize: bool = True,
                         use_projector: bool = False) -> torch.Tensor:
    """
    Trả về 1 vector [1, D] mô tả toàn ảnh, bằng cách:
      (1) pool theo token trong từng tile → [T, D]
      (2) lấy mean across-tiles → [D]
    """
    model.eval()
    device = next(model.parameters()).device
    x = pixel_values.to(device)

    out = model.vision_model(pixel_values=x)
    tok = out.last_hidden_state                  # [T, N, D] hoặc [1, N, D]

    if use_projector:
        in_feat = getattr(model.mlp1[1], "in_features", None)
        if in_feat is not None and tok.size(-1) == in_feat:
            tok = model.mlp1(tok)
        else:
            raise ValueError(f"Projector input dim mismatch: tokens={tok.size(-1)} vs mlp1.in={in_feat}")

    # pool theo token trong từng tile
    T = tok.size(0)
    per_tile = [ _pool_tokens(tok[t:t+1], how=pool) for t in range(T) ]  # list of [D]
    per_tile = torch.stack(per_tile, dim=0)       # [T, D]

    # mean across-tiles
    v = per_tile.mean(dim=0)                      # [D]
    if normalize:
        v = F.normalize(v, dim=-1)
    return v.unsqueeze(0)                         # [1, D]


# infer

In [14]:
def get_image_embedding(path):
    """
    Processes a batch of images and extracts their embeddings.
    """
    images_pil = []
    valid_paths = []
    if path.lower().endswith(('.png', '.jpg', '.jpeg')):
        try:
            # The processor expects PIL images in RGB format
            images_pil.append(Image.open(path).convert("RGB"))
            # print(path)
            valid_paths.append(path)
        except Exception as e:
            print(f"Warning: Could not load image {path}. Skipping. Error: {e}")

    if not valid_paths:
        return np.array([]), []

    inputs = processor(
        text=[""] * len(images_pil),
        images=images_pil,
        padding=True,
        return_tensors="pt"
    ).to(device)
    
    # embeddings = image_embedding(inputs, model, use_tiling=True)
    embeddings = image_embedding_mean(model, inputs)
    
    return embeddings.to(torch.float16).cpu().numpy()

In [15]:
import json

# --- Process all images in the directory ---
image_files = [os.path.join(IMAGE_DIR, f) for f in os.listdir(IMAGE_DIR) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
all_embeddings = []
filepaths = []
BATCH_SIZE = 1

with open("embeddings_factures_osteopathie_1k_InternVL3_5-4B-Instruct.json", "w") as f:

    f.write("[\n")
    first = True
    for i in tqdm(range(0, len(image_files), BATCH_SIZE)):
        batch_paths = image_files[i]
        batch_embeddings = get_image_embedding(batch_paths)
        embeddings_list = [emb.tolist() for emb in batch_embeddings]
        for path, emb in zip(batch_paths, embeddings_list):
            if not first:
                f.write(",\n")
            json.dump({"filepath": path, "embedding": emb}, f)
            first = False
    f.write("\n]\n")

print("Embeddings extracted and saved.")

  0%|          | 0/2800 [00:03<?, ?it/s]


AttributeError: 'Qwen2_5_VLForConditionalGeneration' object has no attribute 'vision_model'

In [9]:
image_path = "/home/nguyendc/model-factory/Finetuning-Automation/etc/data/media/docai_mgp_facture_v2_0/c363e486-5d45-425e-aef9-4791cad120f7_20250213_120759_1_scale_1.0.jpg"

In [11]:
pixel_values = load_image(image_path, max_num=12).to(torch.bfloat16).cuda()
generation_config = dict(max_new_tokens=1024, do_sample=True)



question = '<image>\nPlease describe the image shortly.'
response = model.chat(tokenizer, pixel_values, question, generation_config)
print(f'User: {question}\nAssistant: {response}')

# single-image multi-round conversation (单图多轮对话)
question = '<image>\nPlease describe the image in detail.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)
print(f'User: {question}\nAssistant: {response}')

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


User: <image>
Please describe the image shortly.
Assistant: The image shows a receipt for a consultation with Noël Breignaud, an osteopath. It includes his contact information, with the address "104, cours des fossés, 33210 Langon" and his phone number. The receipt details a payment of 55€ for a consultation dated 15/06/2020. The receipt number is 1750401922774. There are handwritten details and signatures, with the amount and date written in ink. Noël Breignaud's signature and a circular stamp are also present.
User: <image>
Please describe the image in detail.
Assistant: The image is a handwritten receipt or invoice from a practitioner named Noël Breign aud, who is an osteopath. The text on the left side of the document includes the following details:

- **Name:** Noël Breignaud
- **Profession:** Ouestopathe (Osteopath)
- **Address:** 104, cours des fossés, 33210 Lagnon
- **Phone Number:** Tel. 06 88 70 66 43

On the right side, there are registration and identification numbers:

- *

In [23]:
vout = model.vision_model(pixel_values=pixel_values, output_hidden_states=True)

In [None]:
patch_feats = vout.last_hidden_state             # [B, N_patches, Dv], Dv ~ 1024 theo kiến trúc của bạn
print(patch_feats.shape)
# Nếu backbone có CLS token, bạn có thể dùng patch_feats[:,0]
# Cách an toàn chung: mean-pool
# img_vec = patch_feats.mean(dim=1)                # [B, Dv]
# img_vec = torch.nn.functional.normalize(img_vec, dim=-1)  # L2 normalize cho retrieval

torch.Size([7, 1025, 1024])


In [None]:
# img_vec.shape

torch.Size([7, 1024])

In [14]:
model

InternVLChatModel(
  (vision_model): InternVisionModel(
    (embeddings): InternVisionEmbeddings(
      (patch_embedding): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
    )
    (encoder): InternVisionEncoder(
      (layers): ModuleList(
        (0): InternVisionEncoderLayer(
          (attn): InternAttention(
            (qkv): Linear(in_features=1024, out_features=3072, bias=True)
            (attn_drop): Dropout(p=0.0, inplace=False)
            (proj_drop): Dropout(p=0.0, inplace=False)
            (inner_attn): FlashAttention()
            (proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (mlp): InternMLP(
            (act): GELUActivation()
            (fc1): Linear(in_features=1024, out_features=4096, bias=True)
            (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
          (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affin

In [None]:
import fiftyone as fo
import fiftyone.brain as fob
import numpy as np
from sklearn.mixture import GaussianMixture
import json

DATASET_NAME = "mock"

json_path = "./embeddings_factures_osteopathie_1k_qwen.json"

with open(json_path, "r") as file:
    embedding_data = json.load(file)

file_paths = []
embeddings = []
for i, record in enumerate(embedding_data):
    file_paths.append(record.get("filepath"))
    embeddings.append(record.get("embedding"))

if DATASET_NAME in fo.list_datasets():
    dataset = fo.load_dataset(DATASET_NAME)
    dataset.delete()
dataset = fo.Dataset(DATASET_NAME)

# Add samples to the dataset
samples = [fo.Sample(filepath=p) for p in file_paths]
dataset.add_samples(samples)

# Building Gaussian mixture model (GMM)
n_gaussians = 50
gmm = GaussianMixture(n_components=n_gaussians, random_state=42)
gmm.fit(embeddings)
cluster_labels = gmm.predict(embeddings)

# Adding labeled embeddings to visulization
dataset.add_sample_field("gmm_cluster", fo.IntField)
for sample, label in zip(dataset, cluster_labels):
    sample["gmm_cluster_50_gaussians"] = int(label)
    sample.save()

n_gaussians = 200
gmm = GaussianMixture(n_components=n_gaussians, random_state=42)
gmm.fit(embeddings)
cluster_labels = gmm.predict(embeddings)

# Adding labeled embeddings to visulization
dataset.add_sample_field("gmm_cluster", fo.IntField)
for sample, label in zip(dataset, cluster_labels):
    sample["gmm_cluster_200_gaussians"] = int(label)
    sample.save()

# --- Visualize the Embeddings with UMAP ---
# This will compute a 2D representation of your embeddings
# for visualization.
res = fob.compute_visualization(
    dataset,
    embeddings=embeddings,
    brain_key="qwen_vision_viz",
    method="tsne",
    verbose=True
)
dataset.set_values("qwen_umap", res.current_points)

print("UMAP visualization computed. Launch the app to see the plot.")
session = fo.launch_app(dataset)

 100% |███████████████| 1091/1091 [174.1ms elapsed, 0s remaining, 6.3K samples/s]   
