embedding-clustering/cluster/auto_cluster.py
#!/usr/bin/env python3
"""
Simple script to run automatic clustering methods (DBSCAN, Mean Shift, Affinity Propagation)
These methods don't require specifying the number of clusters beforehand.
"""
import json
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN, MeanShift, AffinityPropagation
from sklearn.preprocessing import normalize
from sklearn.metrics import silhouette_score
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA
import argparse
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics.pairwise import cosine_similarity
import datetime  # accessed as datetime.datetime throughout this module
def value_counts(a, dropna=False):
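    """Count occurrences of each value in `a` and return them as a dict.
    Example: value_counts(np.array([0, 0, 1, -1])) -> {-1: 1, 0: 2, 1: 1}
    (keys/counts are numpy scalars); label -1 is used below for DBSCAN noise."""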
a_flat = a.ravel()
if dropna and np.issubdtype(a.dtype, np.floating):
mask = ~np.isnan(a_flat)
a_flat = a_flat[mask]
uniq, counts = np.unique(a_flat, return_counts=True)
uniq = list(uniq)
counts = list(counts)
return dict(zip(uniq, counts))
class AutoClustering:
def __init__(self, embeddings_path):
self.embeddings_path = embeddings_path
self.embeddings = None
self.file_paths = None
self.load_embeddings()
def load_embeddings(self):
"""Load embeddings from JSON file"""
print(f"Loading embeddings from {self.embeddings_path}...")
with open(self.embeddings_path, 'r') as f:
data = json.load(f)
self.file_paths = []
embeddings_list = []
for item in data:
self.file_paths.append(item['filepath'])
embeddings_list.append(item['embedding'])
self.embeddings = np.array(embeddings_list, dtype=np.float32)
print(f"Loaded {len(self.file_paths)} samples with embedding dimension {self.embeddings.shape[1]}")
# Normalize embeddings using L2 normalization for cosine distance
self.embeddings_normalized = normalize(self.embeddings, norm='l2', axis=1)
print("Applied L2 normalization to embeddings")
        sims = cosine_similarity(self.embeddings)
        print(self.embeddings.shape)
        # Take the upper triangle (excluding the diagonal) to inspect pairwise similarities
        triu_idxs = np.triu_indices_from(sims, k=1)
        sim_vals = sims[triu_idxs]
        print(sim_vals.shape)
        print("mean sim:", sim_vals.mean(), "std:", sim_vals.std())
def run_dbscan(self):
"""Run DBSCAN with extensive grid search for parameter estimation"""
print("\n" + "="*50)
print("RUNNING DBSCAN CLUSTERING WITH EXTENSIVE GRID SEARCH")
print("="*50)
# Method 1: K-nearest neighbors approach with multiple k values
# eps_candidates = []
# # Try different k values for nearest neighbors with cosine metric
# k_values = [5, 10, 15, 20, 25, 30]
# for k in k_values:
# k_actual = min(k, len(self.embeddings_normalized) // 4)
# if k_actual < 3:
# continue
# neighbors = NearestNeighbors(n_neighbors=k_actual, metric='cosine')
# neighbors_fit = neighbors.fit(self.embeddings_normalized)
# distances, indices = neighbors_fit.kneighbors(self.embeddings_normalized)
# # Sort distances and use k-th nearest neighbor distance
# distances = np.sort(distances, axis=0)
# kth_distances = distances[:, k_actual-1]
# # Multiple percentile thresholds for each k
# percentiles = [60, 65, 70, 75, 80, 85, 90, 95]
# for p in percentiles:
# eps_candidates.append(np.percentile(kth_distances, p))
# # Method 2: Statistical measures using cosine distances
# # Calculate cosine distances for a sample of data points
# sample_size = min(1000, len(self.embeddings_normalized))
# sample_indices = np.random.choice(len(self.embeddings_normalized), sample_size, replace=False)
# sample_data = self.embeddings_normalized[sample_indices]
# from scipy.spatial.distance import pdist
# cosine_distances = pdist(sample_data, metric='cosine')
# # Add statistical measures as eps candidates using cosine distances
# eps_candidates.extend([
# np.mean(cosine_distances) * 0.3,
# np.mean(cosine_distances) * 0.4,
# np.mean(cosine_distances) * 0.5,
# np.mean(cosine_distances) * 0.6,
# np.mean(cosine_distances) * 0.7,
# np.median(cosine_distances) * 0.3,
# np.median(cosine_distances) * 0.4,
# np.median(cosine_distances) * 0.5,
# np.median(cosine_distances) * 0.6,
# np.std(cosine_distances) * 0.5,
# np.std(cosine_distances) * 0.8,
# np.std(cosine_distances) * 1.0,
# np.std(cosine_distances) * 1.2
# ])
# Method 3: Manual eps values for cosine distances (0-2 range)
manual_eps = [0.001, 0.002, 0.005, 0.01, 0.02, 0.03, 0.04, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5,
0.6, 0.7, 0.8, 0.9, 1.0]
# eps_candidates.extend(manual_eps)
# eps_candidates = manual_eps
eps_candidates = [0.2]
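        # With metric='cosine', eps is a cosine distance: eps=0.2 admits
        # neighbours whose cosine similarity is at least 0.8.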
# Remove duplicates and invalid values, then sort
eps_candidates = sorted(list(set([eps for eps in eps_candidates if eps > 0])))
# Extensive min_samples candidates
# min_samples_candidates = [3, 4, 5, 6, 7, 8, 9, 10, 12, 15, 18, 20, 25, 30, 35, 40, 50]
# Filter min_samples based on dataset size
# max_min_samples = len(self.embeddings_normalized) // 10 # At most 10% of data
# min_samples_candidates = [ms for ms in min_samples_candidates if ms <= max_min_samples]
min_samples_candidates = [50]
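        # min_samples=50 means a point needs at least 50 samples (itself included)
        # within eps to be a core point; larger values give denser clusters and more noise.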
best_score = -1
best_params = None
best_labels = None
print(f"Testing {len(eps_candidates)} eps values x {len(min_samples_candidates)} min_samples values")
print(f"Total combinations: {len(eps_candidates) * len(min_samples_candidates)}")
print("This may take a while...\n")
# Track all results for analysis
all_results = []
total_combinations = len(eps_candidates) * len(min_samples_candidates)
current_combination = 0
for eps in eps_candidates:
for min_samples in min_samples_candidates:
current_combination += 1
# Progress indicator
if current_combination % 50 == 0 or current_combination == total_combinations:
progress = (current_combination / total_combinations) * 100
print(f"Progress: {current_combination}/{total_combinations} ({progress:.1f}%)")
try:
dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric='cosine')
labels = dbscan.fit_predict(self.embeddings_normalized)
# Check if we have meaningful clusters
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
n_noise = list(labels).count(-1)
noise_ratio = n_noise / len(labels)
# Store result for analysis
result_info = {
'eps': eps,
'min_samples': min_samples,
'n_clusters': n_clusters,
'n_noise': n_noise,
'noise_ratio': noise_ratio
}
                    # Per-combination diagnostics
print(n_clusters, n_noise, noise_ratio, eps, min_samples)
if n_clusters >= 2 and noise_ratio < 0.9: # Not too many noise points
# Calculate silhouette score (excluding noise) using cosine metric
mask = labels != -1
if np.sum(mask) > 1:
try:
score = silhouette_score(self.embeddings_normalized[mask], labels[mask], metric='cosine')
result_info['silhouette_score'] = score
# Print promising results
if score > 0.1: # Only show decent scores
print(f"eps={eps:.4f}, min_samples={min_samples}: {n_clusters} clusters, {n_noise} noise ({noise_ratio:.1%}), silhouette={score:.4f}")
print(value_counts(labels))
if score > best_score:
best_score = score
best_params = (eps, min_samples)
best_labels = labels
except Exception:
result_info['silhouette_score'] = None
else:
result_info['silhouette_score'] = None
all_results.append(result_info)
except Exception as e:
import traceback
traceback.print_exc()
# Skip problematic parameter combinations
continue
# Analysis of results
print("\n" + "="*50)
print("DBSCAN GRID SEARCH ANALYSIS")
print("="*50)
if all_results:
# Convert to numpy for easier analysis
import pandas as pd
df_results = pd.DataFrame(all_results)
print(f"Total parameter combinations tested: {len(df_results)}")
# Valid results (with clusters)
valid_results = df_results[df_results['n_clusters'] >= 2]
print(f"Combinations that produced clusters: {len(valid_results)}")
if len(valid_results) > 0:
# Best silhouette scores
scored_results = valid_results.dropna(subset=['silhouette_score'])
if len(scored_results) > 0:
print(f"Combinations with valid silhouette scores: {len(scored_results)}")
print(f"Best silhouette score: {scored_results['silhouette_score'].max():.4f}")
print(f"Mean silhouette score: {scored_results['silhouette_score'].mean():.4f}")
# Top 5 results
top_results = scored_results.nlargest(5, 'silhouette_score')
print("\nTop 5 parameter combinations:")
for idx, row in top_results.iterrows():
print(f" eps={row['eps']:.4f}, min_samples={row['min_samples']}: "
f"{row['n_clusters']} clusters, silhouette={row['silhouette_score']:.4f}")
# Cluster count distribution
cluster_counts = valid_results['n_clusters'].value_counts().sort_index()
print("\nCluster count distribution:")
for n_clusters, count in cluster_counts.items():
print(f" {n_clusters} clusters: {count} parameter combinations")
print("\n📁 SAVING DETAILED RESULTS...")
print("="*30)
# Save detailed grid search results to JSON file
self.save_dbscan_grid_search_results(all_results, best_params, best_score)
if best_labels is not None:
n_clusters = len(set(best_labels)) - (1 if -1 in best_labels else 0)
n_noise = list(best_labels).count(-1)
print("\nBest DBSCAN result:")
print(f"Parameters: eps={best_params[0]:.4f}, min_samples={best_params[1]}")
print(f"Number of clusters: {n_clusters}")
print(f"Number of noise points: {n_noise} ({n_noise/len(best_labels)*100:.1f}%)")
print(f"Silhouette score: {best_score:.4f}")
return best_labels
else:
print("DBSCAN could not find suitable clusters with the extensive grid search")
print("Consider:")
print("- Adjusting the embedding space (different model or preprocessing)")
print("- Using different clustering algorithms")
print("- Manual parameter tuning based on domain knowledge")
return None
def save_dbscan_grid_search_results(self, all_results, best_params, best_score):
"""Save detailed DBSCAN grid search results to JSON file"""
# Prepare comprehensive results data
grid_search_data = {
"experiment_info": {
"timestamp": datetime.datetime.now().isoformat(),
"dataset_path": self.embeddings_path,
"total_samples": len(self.file_paths),
"embedding_dimension": self.embeddings.shape[1],
"total_combinations_tested": len(all_results)
},
"best_result": {
"eps": best_params[0] if best_params else None,
"min_samples": best_params[1] if best_params else None,
"silhouette_score": best_score if best_score > -1 else None
},
"all_trials": []
}
# Add all trial results
for i, result in enumerate(all_results):
trial_data = {
"trial_id": i + 1,
"parameters": {
"eps": result['eps'],
"min_samples": result['min_samples']
},
"results": {
"n_clusters": result['n_clusters'],
"n_noise": result['n_noise'],
"noise_ratio": result['noise_ratio'],
"silhouette_score": result['silhouette_score']
},
"status": "success" if result['silhouette_score'] is not None else "failed"
}
grid_search_data["all_trials"].append(trial_data)
# Calculate summary statistics
valid_trials = [t for t in grid_search_data["all_trials"] if t["status"] == "success"]
if valid_trials:
silhouette_scores = [t["results"]["silhouette_score"] for t in valid_trials if t["results"]["silhouette_score"] is not None]
if silhouette_scores:
grid_search_data["summary_statistics"] = {
"total_trials": len(all_results),
"successful_trials": len(valid_trials),
"success_rate": len(valid_trials) / len(all_results),
"best_silhouette_score": max(silhouette_scores),
"worst_silhouette_score": min(silhouette_scores),
"mean_silhouette_score": sum(silhouette_scores) / len(silhouette_scores),
"median_silhouette_score": sorted(silhouette_scores)[len(silhouette_scores)//2]
}
# Top 10 results
sorted_valid_trials = sorted(valid_trials,
key=lambda x: x["results"]["silhouette_score"] if x["results"]["silhouette_score"] is not None else -1,
reverse=True)
grid_search_data["top_10_results"] = sorted_valid_trials[:10]
# Parameter analysis
eps_values = [t["parameters"]["eps"] for t in valid_trials]
min_samples_values = [t["parameters"]["min_samples"] for t in valid_trials]
grid_search_data["parameter_analysis"] = {
"eps_range": {
"min": min(eps_values),
"max": max(eps_values),
"mean": sum(eps_values) / len(eps_values)
},
"min_samples_range": {
"min": min(min_samples_values),
"max": max(min_samples_values),
"mean": sum(min_samples_values) / len(min_samples_values)
}
}
# Save to file with timestamp
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
# filename = f"dbscan_grid_search_detailed_{timestamp}.json"
filename = "dbscan_grid_search_detailed.json"
print(grid_search_data.keys())
print(type(grid_search_data['parameter_analysis']))
with open(filename, 'w') as f:
json.dump(grid_search_data, f, indent=4, ensure_ascii=False)
print(f"Detailed grid search results saved to: {filename}")
# Also save a CSV summary for easy analysis
# csv_filename = f"dbscan_grid_search_summary_{timestamp}.csv"
csv_filename = "dbscan_grid_search_summary.csv"
self.save_grid_search_csv(all_results, csv_filename)
print(f"Grid search summary CSV saved to: {csv_filename}")
def save_grid_search_csv(self, all_results, filename):
"""Save grid search results as CSV for easy analysis"""
import csv
with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
fieldnames = ['trial_id', 'eps', 'min_samples', 'n_clusters', 'n_noise',
'noise_ratio', 'silhouette_score', 'status']
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()
for i, result in enumerate(all_results):
writer.writerow({
'trial_id': i + 1,
'eps': result['eps'],
'min_samples': result['min_samples'],
'n_clusters': result['n_clusters'],
'n_noise': result['n_noise'],
'noise_ratio': result['noise_ratio'],
'silhouette_score': result['silhouette_score'],
'status': 'success' if result['silhouette_score'] is not None else 'failed'
})
def run_mean_shift(self):
"""Run Mean Shift clustering"""
print("\n" + "="*50)
print("RUNNING MEAN SHIFT CLUSTERING")
print("="*50)
# Estimate bandwidth using different percentiles with cosine metric
from sklearn.cluster import estimate_bandwidth
# Try different bandwidth estimation methods
bandwidth_candidates = []
# Method 1: sklearn's estimate_bandwidth (note: estimate_bandwidth doesn't support cosine directly)
try:
bw_est = estimate_bandwidth(self.embeddings_normalized, quantile=0.3, n_samples=min(500, len(self.embeddings_normalized)))
if bw_est > 0:
bandwidth_candidates.extend([bw_est * 0.5, bw_est, bw_est * 1.5])
except Exception:
pass
# Method 2: nearest neighbor cosine distances
neighbors = NearestNeighbors(n_neighbors=10, metric='cosine')
neighbors_fit = neighbors.fit(self.embeddings_normalized)
distances, _ = neighbors_fit.kneighbors(self.embeddings_normalized)
mean_dist = np.mean(distances[:, 1:]) # Exclude self-distance
bandwidth_candidates.extend([mean_dist * 0.5, mean_dist, mean_dist * 1.5])
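        # Note: MeanShift itself works in Euclidean space; for L2-normalized vectors,
        # squared Euclidean distance = 2 * (1 - cosine similarity), so bandwidths
        # derived from cosine distances are only a rough heuristic here.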
# Remove duplicates and invalid values
bandwidth_candidates = list(set([bw for bw in bandwidth_candidates if bw > 0]))
if not bandwidth_candidates:
bandwidth_candidates = [0.5, 1.0, 1.5, 2.0]
best_score = -1
best_bandwidth = None
best_labels = None
print("Testing different bandwidth values...")
for bandwidth in bandwidth_candidates:
try:
mean_shift = MeanShift(bandwidth=bandwidth)
labels = mean_shift.fit_predict(self.embeddings_normalized)
n_clusters = len(set(labels))
if 2 <= n_clusters <= len(self.embeddings_normalized) // 3:
score = silhouette_score(self.embeddings_normalized, labels, metric='cosine')
print(f"bandwidth={bandwidth:.4f}: {n_clusters} clusters, silhouette={score:.4f}")
if score > best_score:
best_score = score
best_bandwidth = bandwidth
best_labels = labels
except Exception as e:
print(f"bandwidth={bandwidth:.4f}: failed ({str(e)[:50]}...)")
continue
if best_labels is not None:
n_clusters = len(set(best_labels))
print("\nBest Mean Shift result:")
print(f"Bandwidth: {best_bandwidth:.4f}")
print(f"Number of clusters: {n_clusters}")
print(f"Silhouette score: {best_score:.4f}")
return best_labels
else:
print("Mean Shift could not find suitable clusters")
return None
def run_affinity_propagation(self):
"""Run Affinity Propagation clustering"""
print("\n" + "="*50)
print("RUNNING AFFINITY PROPAGATION CLUSTERING")
print("="*50)
# Calculate similarity matrix using cosine similarity
        # (cosine_similarity is already imported at module level)
        similarities = cosine_similarity(self.embeddings_normalized)
        # Try different preference values. The diagonal of a cosine similarity matrix
        # of L2-normalized vectors is all 1.0, so draw candidate preferences from the
        # off-diagonal similarities rather than from the diagonal.
        off_diag = similarities[np.triu_indices_from(similarities, k=1)]
        preference_candidates = [
            np.percentile(off_diag, 10),
            np.percentile(off_diag, 25),
            np.percentile(off_diag, 50),
            np.median(similarities),
            np.percentile(off_diag, 75)
        ]
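        # Lower preference values generally yield fewer exemplars (clusters);
        # higher values yield more.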
damping_candidates = [0.5, 0.7, 0.8, 0.9]
best_score = -1
best_params = None
best_labels = None
print("Testing different parameter combinations...")
for preference in preference_candidates:
for damping in damping_candidates:
try:
                    affinity_prop = AffinityPropagation(
                        affinity='precomputed',
                        preference=preference,
                        damping=damping,
                        random_state=42,
                        max_iter=200
                    )
                    # With affinity='precomputed', fit_predict takes the cosine similarity matrix
                    labels = affinity_prop.fit_predict(similarities)
n_clusters = len(set(labels))
if 2 <= n_clusters <= len(self.embeddings_normalized) // 3:
score = silhouette_score(self.embeddings_normalized, labels, metric='cosine')
print(f"preference={preference:.2f}, damping={damping:.1f}: {n_clusters} clusters, silhouette={score:.4f}")
if score > best_score:
best_score = score
best_params = (preference, damping)
best_labels = labels
except Exception as e:
print(f"preference={preference:.2f}, damping={damping:.1f}: failed ({str(e)[:30]}...)")
continue
if best_labels is not None:
n_clusters = len(set(best_labels))
print("\nBest Affinity Propagation result:")
print(f"Parameters: preference={best_params[0]:.2f}, damping={best_params[1]:.1f}")
print(f"Number of clusters: {n_clusters}")
print(f"Silhouette score: {best_score:.4f}")
return best_labels
else:
print("Affinity Propagation could not find suitable clusters")
return None
def visualize_results(self, results_dict):
"""Visualize clustering results using PCA"""
if not results_dict:
print("No results to visualize")
return
# Reduce dimensions for visualization
pca = PCA(n_components=2, random_state=42)
embeddings_2d = pca.fit_transform(self.embeddings_normalized)
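        # Note: two principal components capture only part of the variance of the
        # embeddings, so these scatter plots are a rough visual check, not a
        # faithful map of the cluster geometry.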
n_methods = len(results_dict)
fig, axes = plt.subplots(1, n_methods, figsize=(5*n_methods, 4))
if n_methods == 1:
axes = [axes]
for idx, (method_name, labels) in enumerate(results_dict.items()):
# Handle noise points in DBSCAN (label -1)
unique_labels = set(labels)
colors = plt.cm.Set1(np.linspace(0, 1, len(unique_labels)))
for label, color in zip(unique_labels, colors):
if label == -1:
# Noise points in black
mask = labels == label
axes[idx].scatter(embeddings_2d[mask, 0], embeddings_2d[mask, 1],
c='black', marker='x', s=20, alpha=0.5, label='Noise')
else:
mask = labels == label
axes[idx].scatter(embeddings_2d[mask, 0], embeddings_2d[mask, 1],
c=[color], s=50, alpha=0.7, label=f'Cluster {label}')
axes[idx].set_title(f'{method_name}\n({len(set(labels)) - (1 if -1 in labels else 0)} clusters)')
axes[idx].set_xlabel('PCA Component 1')
axes[idx].set_ylabel('PCA Component 2')
axes[idx].grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('auto_clustering_results.png', dpi=300, bbox_inches='tight')
plt.show()
print("\nVisualization saved as 'auto_clustering_results.png'")
def save_results(self, results_dict):
"""Save clustering results to JSON files"""
print(results_dict.items())
check_method_name = []
print(len(results_dict))
for method_name, labels in results_dict.items():
check_method_name.append(method_name)
# Create results for each method
method_results = []
print(method_name == 'DBSCAN')
for filepath, label in zip(self.file_paths, labels):
                # Only DBSCAN labels noise points as -1
                is_noise = (method_name == 'DBSCAN' and label == -1)
method_results.append({
"filepath": filepath,
"cluster": int(label),
"is_noise": is_noise
})
print('method_name', set(check_method_name))
print(method_results[0]['is_noise'])
print(method_results[0])
# Save to file
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"{method_name.lower().replace(' ', '_')}_results_{timestamp}.json"
with open(filename, 'w') as f:
json.dump({
"method": method_name,
"n_clusters": len(set(labels)) - (1 if -1 in labels else 0),
"n_samples": len(labels),
"results": method_results
}, f, indent=4)
print(f"Results saved to {filename}")
def run_all_methods(self):
"""Run all automatic clustering methods"""
print("\n" + "="*70)
print("AUTOMATIC CLUSTERING ANALYSIS")
print("="*70)
print(f"Dataset: {len(self.file_paths)} documents")
print(f"Embedding dimension: {self.embeddings.shape[1]}")
results = {}
# Run DBSCAN
dbscan_labels = self.run_dbscan()
if dbscan_labels is not None:
results["DBSCAN"] = dbscan_labels
# Run Mean Shift
# meanshift_labels = self.run_mean_shift()
# if meanshift_labels is not None:
# results["Mean Shift"] = meanshift_labels
# Run Affinity Propagation
# affinity_labels = self.run_affinity_propagation()
# if affinity_labels is not None:
# results["Affinity Propagation"] = affinity_labels
# Summary
if results:
print("\n" + "="*70)
print("SUMMARY OF RESULTS")
print("="*70)
for method, labels in results.items():
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
if method == "DBSCAN":
n_noise = list(labels).count(-1)
print(f"{method}: {n_clusters} clusters, {n_noise} noise points")
else:
print(f"{method}: {n_clusters} clusters")
# Calculate agreement between methods if multiple succeeded
if len(results) > 1:
from sklearn.metrics import adjusted_rand_score
print("\nMethod Agreement (Adjusted Rand Index):")
method_names = list(results.keys())
for i in range(len(method_names)):
for j in range(i+1, len(method_names)):
ari = adjusted_rand_score(results[method_names[i]], results[method_names[j]])
print(f"{method_names[i]} vs {method_names[j]}: {ari:.4f}")
# Visualize and save results
self.visualize_results(results)
self.save_results(results)
else:
print("\nNo automatic clustering method found suitable clusters.")
print("This might indicate:")
print("- Data doesn't have clear cluster structure")
print("- Embeddings need different preprocessing")
print("- Different parameter ranges needed")
return results
def main():
parser = argparse.ArgumentParser(description="Run automatic clustering methods on document embeddings")
parser.add_argument("--embeddings_path", help="Path to embeddings JSON file")
parser.add_argument("--method", choices=['None', 'dbscan', 'meanshift', 'affinity', 'all'], default='all',
help="Which automatic method to run")
args = parser.parse_args()
# Initialize clustering
clustering = AutoClustering(args.embeddings_path)
# Run selected method(s)
if args.method == 'all':
clustering.run_all_methods()
elif args.method == 'dbscan':
labels = clustering.run_dbscan()
if labels is not None:
clustering.visualize_results({"DBSCAN": labels})
clustering.save_results({"DBSCAN": labels})
elif args.method == 'meanshift':
labels = clustering.run_mean_shift()
if labels is not None:
clustering.visualize_results({"Mean Shift": labels})
clustering.save_results({"Mean Shift": labels})
elif args.method == 'affinity':
labels = clustering.run_affinity_propagation()
if labels is not None:
clustering.visualize_results({"Affinity Propagation": labels})
clustering.save_results({"Affinity Propagation": labels})
elif args.method == 'None':
pass
if __name__ == "__main__":
main()