Files
embedding-clustering/filter/dbscan_v3.py

353 lines
14 KiB
Python
Raw Normal View History

2025-09-04 14:39:02 +00:00
#!/usr/bin/env python3
"""
DBSCAN Clustering Filter
Filters clustering results based on specific criteria, parameterized via CLI:
- For each cluster: select a configurable ratio of points (selection_ratio)
- A configurable portion from center region (center_ratio)
- A configurable portion from border region (border_ratio)
- Noise points are sampled with the same selection_ratio (fixed random seed)
- Uses cosine distance metric
CLI parameters added:
--selection_ratio (float, default 0.5)
--center_ratio (float, default 0.5)
--border_ratio (float, default 0.5)
Example:
python dbscan_v3.py \
--embeddings_path embeddings.json \
--clustering_results_path dbscan_results.json \
--selection_ratio 0.4 --center_ratio 0.6 --border_ratio 0.4
"""
import json
import numpy as np
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_distances
import argparse
import os
from pathlib import Path
import random
class DBSCANFilter:
    """Down-sample DBSCAN clustering results, cluster by cluster.

    Every cluster is split into a "center" and a "border" region according to
    the cosine distance of its points to the cluster centroid, and a seeded
    random sample is drawn from each region.  Noise points are sampled with
    ``selection_ratio`` as well.

    The class attributes below hold the tuning constants of the algorithm;
    override them on a subclass or an instance to change the behaviour.
    """

    # Lower bound of the per-cluster target size.  A cluster whose target size
    # reaches its real size is returned unfiltered (see ``filter_cluster``).
    MIN_POINTS_PER_CLUSTER = 15
    # Points closer to the centroid than this cosine distance are "center"
    # points; all other points are "border" points.
    CENTER_DISTANCE_THRESHOLD = 0.1
    # Border regions with fewer points than this are kept in full.
    SMALL_BORDER_LIMIT = 70
    # Seed of the private random generators, so repeated runs pick the same
    # points.
    RANDOM_SEED = 42

    def __init__(self, embeddings_path, clustering_results_path,
                 selection_ratio=0.5, center_ratio=0.5, border_ratio=0.5):
        """Initialize DBSCAN filter.

        Args:
            embeddings_path: Path to embeddings JSON file
            clustering_results_path: Path to DBSCAN clustering results JSON
            selection_ratio: Ratio of total cluster points to consider selecting
            center_ratio: Ratio applied within center region (relative scaling)
            border_ratio: Ratio applied within border region (relative scaling)
        """
        self.embeddings_path = embeddings_path
        self.clustering_results_path = clustering_results_path
        self.embeddings = None
        self.embeddings_normalized = None
        self.clustering_results = None
        self.filepath_to_embedding = {}
        self.selection_ratio = selection_ratio
        self.center_ratio = center_ratio
        self.border_ratio = border_ratio

    def load_data(self):
        """Load embeddings and clustering results from their JSON files.

        Fills ``filepath_to_embedding``, ``embeddings``,
        ``embeddings_normalized`` and ``clustering_results``.  The embeddings
        file is iterated as a sequence of items carrying ``'filepath'`` and
        ``'embedding'`` keys.
        """
        print("Loading embeddings...")
        with open(self.embeddings_path, 'r', encoding='utf-8') as f:
            embeddings_data = json.load(f)

        # Create mapping from filepath to embedding
        embeddings_list = []
        for item in embeddings_data:
            self.filepath_to_embedding[item['filepath']] = item['embedding']
            embeddings_list.append(item['embedding'])

        self.embeddings = np.array(embeddings_list, dtype=np.float32)
        self.embeddings_normalized = normalize(self.embeddings, norm='l2')
        print(f"Loaded {len(embeddings_list)} embeddings")

        print("Loading clustering results...")
        with open(self.clustering_results_path, 'r', encoding='utf-8') as f:
            self.clustering_results = json.load(f)
        print(f"Loaded clustering results: {self.clustering_results['n_clusters']} clusters, "
              f"{self.clustering_results['n_samples']} samples")

    def group_by_clusters(self):
        """Group data points by cluster label.

        A result counts as noise when its ``'is_noise'`` flag is truthy or its
        cluster id is ``-1``.

        Returns:
            Tuple ``(clusters, noise_points)``: ``clusters`` maps each cluster
            id to a list of ``{'filepath', 'embedding'}`` dicts and
            ``noise_points`` is a list of the same kind of dicts.

        Raises:
            KeyError: If a clustered filepath has no loaded embedding.
        """
        clusters = {}
        noise_points = []
        for result in self.clustering_results['results']:
            cluster_id = result['cluster']
            filepath = result['filepath']
            try:
                embedding = self.filepath_to_embedding[filepath]
            except KeyError:
                raise KeyError(
                    f"No embedding loaded for filepath: {filepath}"
                ) from None

            point = {'filepath': filepath, 'embedding': embedding}
            if result.get('is_noise', False) or cluster_id == -1:
                noise_points.append(point)
            else:
                clusters.setdefault(cluster_id, []).append(point)
        return clusters, noise_points

    @staticmethod
    def _normalized_embeddings(cluster_points):
        """Return the L2-normalized embedding matrix of ``cluster_points``."""
        embeddings = np.array([point['embedding'] for point in cluster_points])
        return normalize(embeddings, norm='l2')

    def calculate_cluster_centroid(self, cluster_points):
        """Calculate centroid of a cluster using normalized embeddings."""
        embeddings_normalized = self._normalized_embeddings(cluster_points)
        # For cosine distance, centroid is the normalized mean
        centroid = np.mean(embeddings_normalized, axis=0)
        return normalize(centroid.reshape(1, -1), norm='l2')[0]

    def calculate_cosine_distances_to_centroid(self, cluster_points, centroid):
        """Calculate cosine distances from each point to cluster centroid."""
        embeddings_normalized = self._normalized_embeddings(cluster_points)
        return cosine_distances(
            embeddings_normalized, centroid.reshape(1, -1)
        ).flatten()

    def filter_cluster(self, cluster_points):
        """Filter the points of one cluster using the configured ratios.

        The cluster is returned unchanged (the very same list) when the target
        size ``max(MIN_POINTS_PER_CLUSTER, int(n * selection_ratio))`` reaches
        the cluster size and ``selection_ratio`` is not exactly 1.  Otherwise
        the points are ordered by cosine distance to the centroid, split at
        ``CENTER_DISTANCE_THRESHOLD`` and sampled per region:

        - center: ``int(n_center * center_ratio * selection_ratio)`` points,
          at least 1 when the region is not empty;
        - border: every point when the region has fewer than
          ``SMALL_BORDER_LIMIT`` points, otherwise
          ``int(n_border * border_ratio * selection_ratio)`` points.

        NOTE(review): the target size only decides whether the cluster is kept
        whole; it is not enforced as a minimum on the sampled result.

        Args:
            cluster_points: List of ``{'filepath', 'embedding'}`` dicts.

        Returns:
            List of the selected point dicts.
        """
        if not cluster_points:
            return []

        selection_ratio = self.selection_ratio
        total_points = len(cluster_points)
        num_to_select = max(self.MIN_POINTS_PER_CLUSTER,
                            int(total_points * selection_ratio))
        if num_to_select >= total_points and selection_ratio != 1:
            return cluster_points

        centroid = self.calculate_cluster_centroid(cluster_points)
        distances = self.calculate_cosine_distances_to_centroid(cluster_points, centroid)

        # Order by distance first: the sampling below depends on list order.
        point_distance_pairs = sorted(zip(cluster_points, distances),
                                      key=lambda pair: pair[1])

        # Distance threshold used to classify center / border points
        threshold = self.CENTER_DISTANCE_THRESHOLD
        all_center_points = [p for p, d in point_distance_pairs if d < threshold]
        all_border_points = [p for p, d in point_distance_pairs if d >= threshold]
        n_center = len(all_center_points)
        n_border = len(all_border_points)
        print(f"Number of center points (distance < {threshold}): {n_center}")
        print(f"Number of border points (distance >= {threshold}): {n_border}")

        if n_center > 0:
            center_count = max(1, int(n_center * self.center_ratio * selection_ratio))
            center_count = min(center_count, n_center)
        else:
            center_count = 0

        if n_border < self.SMALL_BORDER_LIMIT:
            border_count = n_border
        else:
            border_count = max(0, int(n_border * self.border_ratio * selection_ratio))
            border_count = min(border_count, n_border)

        # A private generator draws the same samples as seeding the global
        # one would, without disturbing other users of the ``random`` module.
        rng = random.Random(self.RANDOM_SEED)
        selected_points = []
        if center_count > 0:
            selected_points.extend(rng.sample(all_center_points, center_count))
        if border_count > 0:
            selected_points.extend(rng.sample(all_border_points, border_count))

        print(
            f"Cluster with {total_points} points -> selected {len(selected_points)} points "
            f"({center_count} center + {border_count} border)"
        )
        return selected_points

    def filter_all_clusters(self):
        """Filter every cluster and sample the noise points.

        Returns:
            List of result dicts with the keys ``'filepath'``, ``'cluster'``,
            ``'is_noise'`` and ``'selection_type'``.  Cluster points come
            first, followed by the sampled noise points (cluster ``-1``).
        """
        print("\n" + "="*60)
        print("FILTERING DBSCAN CLUSTERING RESULTS")
        print("="*60)

        clusters, noise_points = self.group_by_clusters()
        print(f"Found {len(clusters)} clusters and {len(noise_points)} noise points")

        filtered_results = []
        # Process each cluster
        for cluster_id, cluster_points in clusters.items():
            print(f"\nProcessing Cluster {cluster_id}:")
            filtered_results.extend(
                {
                    'filepath': point['filepath'],
                    'cluster': cluster_id,
                    'is_noise': False,
                    'selection_type': 'cluster_filtered',
                }
                for point in self.filter_cluster(cluster_points)
            )

        # Noise points are sampled with selection_ratio; the count is clamped
        # so that ratios outside [0, 1] cannot make the sampling fail.
        n_noise = len(noise_points)
        noise_count = min(n_noise, max(0, int(n_noise * self.selection_ratio)))
        print(f"\nSampling {noise_count} of {n_noise} noise points...")
        rng = random.Random(self.RANDOM_SEED)
        filtered_results.extend(
            {
                'filepath': point['filepath'],
                'cluster': -1,
                'is_noise': True,
                'selection_type': 'noise',
            }
            for point in rng.sample(noise_points, noise_count)
        )
        return filtered_results

    def save_filtered_results(self, filtered_results, output_path=None):
        """Save filtered results to a JSON file and print a summary.

        Args:
            filtered_results: List produced by ``filter_all_clusters``.
            output_path: Destination file.  Defaults to
                ``<clustering results stem>_filtered.json`` in the current
                working directory.

        Returns:
            The path the results were written to.
        """
        if output_path is None:
            # Generate output filename based on input
            base_name = Path(self.clustering_results_path).stem
            output_path = f"{base_name}_filtered.json"

        # Create summary statistics
        cluster_stats = {}
        noise_count = 0
        for result in filtered_results:
            if result['is_noise']:
                noise_count += 1
            else:
                cluster_id = result['cluster']
                cluster_stats[cluster_id] = cluster_stats.get(cluster_id, 0) + 1

        original_n_samples = self.clustering_results['n_samples']

        # Prepare output data
        output_data = {
            "method": "DBSCAN_FILTERED",
            "original_n_clusters": self.clustering_results['n_clusters'],
            "original_n_samples": original_n_samples,
            "filtered_n_samples": len(filtered_results),
            "filtering_criteria": {
                "cluster_selection_ratio": self.selection_ratio,
                "center_points_ratio": self.center_ratio,
                "border_points_ratio": self.border_ratio,
                # Noise is sampled by filter_all_clusters, not kept in full.
                "noise_points": "sampled_with_selection_ratio"
            },
            "cluster_statistics": cluster_stats,
            "noise_points": noise_count,
            "results": filtered_results
        }

        # Make sure the destination directory exists before writing.
        Path(output_path).parent.mkdir(parents=True, exist_ok=True)
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(output_data, f, indent=4, ensure_ascii=False)

        print("\n" + "="*60)
        print("FILTERING SUMMARY")
        print("="*60)
        print(f"Original samples: {original_n_samples}")
        print(f"Filtered samples: {len(filtered_results)}")
        if original_n_samples:
            print(f"Reduction ratio: {len(filtered_results)/original_n_samples:.2%}")
        else:
            # Avoid a ZeroDivisionError when the clustering held no samples.
            print("Reduction ratio: n/a")
        print("\nCluster breakdown:")
        for cluster_id, count in sorted(cluster_stats.items()):
            print(f" Cluster {cluster_id}: {count} points")
        print(f" Noise points: {noise_count} points")
        print(f"\nFiltered results saved to: {output_path}")
        return output_path

    def create_filepath_list(self, filtered_results, output_txt_path=None):
        """Create a simple text file with one filtered filepath per line.

        Args:
            filtered_results: List produced by ``filter_all_clusters``.
            output_txt_path: Destination file.  Defaults to
                ``<clustering results stem>_filtered_filepaths.txt`` in the
                current working directory.

        Returns:
            The path the list was written to.
        """
        if output_txt_path is None:
            base_name = Path(self.clustering_results_path).stem
            output_txt_path = f"{base_name}_filtered_filepaths.txt"

        Path(output_txt_path).parent.mkdir(parents=True, exist_ok=True)
        with open(output_txt_path, 'w', encoding='utf-8') as f:
            f.writelines(f"{result['filepath']}\n" for result in filtered_results)
        print(f"Filepath list saved to: {output_txt_path}")
        return output_txt_path
def main():
    """Run the DBSCAN filter from the command line.

    Parses the CLI arguments, loads the embeddings and clustering results,
    filters the clusters and writes the filtered results (plus, on request, a
    plain filepath list).

    Returns:
        Process exit status: 0 on success, 1 when an input file is missing.
        Invalid arguments terminate the process through ``argparse``.
    """
    # Local import: only this entry point needs ``sys`` (stderr reporting).
    import sys

    parser = argparse.ArgumentParser(description="Filter DBSCAN clustering results")
    parser.add_argument("--embeddings_path", required=True,
                        help="Path to embeddings JSON file")
    parser.add_argument("--clustering_results_path", required=True,
                        help="Path to DBSCAN clustering results JSON file")
    parser.add_argument("--output_path",
                        help="Output path for filtered results (optional)")
    parser.add_argument("--create_filepath_list", action="store_true",
                        help="Also create a simple text file with filtered filepaths")
    parser.add_argument("--selection_ratio", type=float, default=0.5,
                        help="Overall ratio of points to sample per cluster (default: 0.5). Minimum 15 points enforced.")
    parser.add_argument("--center_ratio", type=float, default=0.5,
                        help="Relative ratio applied to center region when sampling (default: 0.5)")
    parser.add_argument("--border_ratio", type=float, default=0.5,
                        help="Relative ratio applied to border region when sampling (default: 0.5)")
    args = parser.parse_args()

    # A negative ratio can never describe a share of points to keep.
    for ratio_name in ("selection_ratio", "center_ratio", "border_ratio"):
        if getattr(args, ratio_name) < 0:
            parser.error(f"--{ratio_name} must be non-negative")

    # Validate input files exist; report on stderr and signal the failure
    # through the exit status so calling scripts can detect it.
    if not os.path.exists(args.embeddings_path):
        print(f"Error: Embeddings file not found: {args.embeddings_path}", file=sys.stderr)
        return 1
    if not os.path.exists(args.clustering_results_path):
        print(f"Error: Clustering results file not found: {args.clustering_results_path}",
              file=sys.stderr)
        return 1

    # Initialize filter with user-provided ratios
    filter_obj = DBSCANFilter(
        args.embeddings_path,
        args.clustering_results_path,
        selection_ratio=args.selection_ratio,
        center_ratio=args.center_ratio,
        border_ratio=args.border_ratio
    )

    # Load data
    filter_obj.load_data()

    # Filter clusters
    filtered_results = filter_obj.filter_all_clusters()

    # Save results
    filter_obj.save_filtered_results(filtered_results, args.output_path)

    # Create filepath list if requested
    if args.create_filepath_list:
        filter_obj.create_filepath_list(filtered_results)

    print("\nFiltering completed successfully!")
    return 0


if __name__ == "__main__":
    # Propagate main()'s status as the process exit code.
    raise SystemExit(main())