#!/usr/bin/env python3
"""
Simple script to run automatic clustering methods (DBSCAN, Mean Shift, Affinity Propagation)
These methods don't require specifying the number of clusters beforehand.
"""

import argparse
import csv
import datetime
import itertools
import json
import statistics
import warnings
from collections import Counter
from operator import itemgetter

import matplotlib.pyplot as plt
import numpy as np
from scipy.spatial.distance import pdist
from sklearn.cluster import DBSCAN, AffinityPropagation, MeanShift, estimate_bandwidth
from sklearn.decomposition import PCA
from sklearn.metrics import adjusted_rand_score, silhouette_score
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings('ignore')

# Seed shared by every randomised step so repeated runs give the same result.
RANDOM_SEED = 42

# min_samples values tried by the DBSCAN grid search when the caller gives none.
DEFAULT_MIN_SAMPLES = (60,)


class AutoClustering:
    """Cluster embeddings with methods that choose the number of clusters themselves.

    The constructor loads the embeddings file immediately, so creating an
    instance performs file I/O and may raise.
    """

    # CLI method name -> (display name used as results key, runner method name).
    METHODS = {
        "dbscan": ("DBSCAN", "run_dbscan"),
        "meanshift": ("Mean Shift", "run_mean_shift"),
        "affinity": ("Affinity Propagation", "run_affinity_propagation"),
    }

    def __init__(self, embeddings_path):
        """Store the path and load + standardise the embeddings it points to."""
        self.embeddings_path = embeddings_path
        self.embeddings = None
        self.embeddings_scaled = None
        self.file_paths = None
        self.scaler = None
        self.load_embeddings()

    def load_embeddings(self):
        """Load embeddings from JSON file.

        The file must hold a list of objects with ``filepath`` and
        ``embedding`` keys.  Populates ``file_paths``, ``embeddings``
        (float32), ``scaler`` and ``embeddings_scaled``.

        Raises:
            ValueError: if the file holds no items or the embeddings do not
                form a 2-D array.
        """
        print(f"Loading embeddings from {self.embeddings_path}...")

        with open(self.embeddings_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        if not data:
            raise ValueError(f"No embeddings found in {self.embeddings_path}")

        self.file_paths = [item['filepath'] for item in data]
        self.embeddings = np.array([item['embedding'] for item in data], dtype=np.float32)
        if self.embeddings.ndim != 2:
            raise ValueError(
                "Embeddings must form a 2-D array (samples x dimensions), "
                f"got shape {self.embeddings.shape}"
            )

        print(f"Loaded {len(self.file_paths)} samples with embedding dimension {self.embeddings.shape[1]}")

        # Standardize embeddings for better clustering
        self.scaler = StandardScaler()
        self.embeddings_scaled = self.scaler.fit_transform(self.embeddings)

    @staticmethod
    def _count_clusters(labels):
        """Return the number of distinct labels, not counting the noise label -1."""
        unique = set(np.asarray(labels).tolist())
        unique.discard(-1)
        return len(unique)

    def _estimate_eps_candidates(self):
        """Build the sorted, de-duplicated list of eps values for the DBSCAN grid."""
        data = self.embeddings_scaled
        n_samples = len(data)
        candidates = []

        # Method 1: percentiles of the k-th nearest neighbour distance.
        # k is capped at a quarter of the data set; values below 3 are dropped.
        k_values = sorted({min(k, n_samples // 4) for k in (5, 10, 15, 20, 25, 30)})
        k_values = [k for k in k_values if k >= 3]
        if k_values:
            # One query at the largest k is enough: kneighbors returns each row
            # sorted by distance, so column k-1 is the k-th neighbour for any
            # smaller k as well (the point itself is column 0).
            neighbors = NearestNeighbors(n_neighbors=k_values[-1]).fit(data)
            distances, _ = neighbors.kneighbors(data)
            percentiles = [60, 65, 70, 75, 80, 85, 90, 95]
            for k in k_values:
                candidates.extend(np.percentile(distances[:, k - 1], percentiles))

        # Method 2: statistics of pairwise distances (sampled for efficiency).
        sample_size = min(1000, n_samples)
        rng = np.random.default_rng(RANDOM_SEED)
        sample_indices = rng.choice(n_samples, sample_size, replace=False)
        pairwise_distances = pdist(data[sample_indices])
        if pairwise_distances.size:
            mean_dist = np.mean(pairwise_distances)
            median_dist = np.median(pairwise_distances)
            std_dist = np.std(pairwise_distances)
            candidates.extend(mean_dist * factor for factor in (0.3, 0.4, 0.5, 0.6, 0.7))
            candidates.extend(median_dist * factor for factor in (0.3, 0.4, 0.5, 0.6))
            candidates.extend(std_dist * factor for factor in (0.5, 0.8, 1.0, 1.2))

        # Method 3: manual eps values for different scales.
        candidates.extend([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0,
                           1.2, 1.5, 1.8, 2.0, 2.5, 3.0, 3.5, 4.0, 5.0])

        # Cast to built-in float so the values stay JSON-serialisable, and
        # drop duplicates and invalid (non-positive / NaN / inf) values.
        return sorted({float(eps) for eps in candidates if np.isfinite(eps) and eps > 0})

    @staticmethod
    def _print_grid_search_analysis(all_results):
        """Print aggregate statistics for the trials recorded by run_dbscan."""
        print(f"Total parameter combinations tested: {len(all_results)}")

        # Valid results (with clusters)
        valid_results = [r for r in all_results if r['n_clusters'] >= 2]
        print(f"Combinations that produced clusters: {len(valid_results)}")
        if not valid_results:
            return

        scored_results = [r for r in valid_results if r['silhouette_score'] is not None]
        if scored_results:
            scores = [r['silhouette_score'] for r in scored_results]
            print(f"Combinations with valid silhouette scores: {len(scored_results)}")
            print(f"Best silhouette score: {max(scores):.4f}")
            print(f"Mean silhouette score: {sum(scores) / len(scores):.4f}")

            top_results = sorted(scored_results, key=itemgetter('silhouette_score'), reverse=True)[:5]
            print("\nTop 5 parameter combinations:")
            for row in top_results:
                print(f"  eps={row['eps']:.4f}, min_samples={row['min_samples']}: "
                      f"{row['n_clusters']} clusters, silhouette={row['silhouette_score']:.4f}")

        cluster_counts = Counter(r['n_clusters'] for r in valid_results)
        print("\nCluster count distribution:")
        for n_clusters, count in sorted(cluster_counts.items()):
            print(f"  {n_clusters} clusters: {count} parameter combinations")

    def run_dbscan(self, min_samples_candidates=None):
        """Run DBSCAN with extensive grid search for parameter estimation.

        Every (eps, min_samples) pair is fitted; the pair with the highest
        silhouette score (computed on non-noise points only) wins.  All
        trials are also written to timestamped JSON and CSV files in the
        current directory.

        Args:
            min_samples_candidates: iterable of positive ints to try as
                ``min_samples``.  Defaults to ``DEFAULT_MIN_SAMPLES``.

        Returns:
            The label array of the best trial, or ``None`` when no trial
            produced at least two clusters with under 90% noise.

        Raises:
            ValueError: if ``min_samples_candidates`` is empty or contains a
                value below 1.
        """
        print("\n" + "="*50)
        print("RUNNING DBSCAN CLUSTERING WITH EXTENSIVE GRID SEARCH")
        print("="*50)

        data = self.embeddings_scaled

        if min_samples_candidates is None:
            min_samples_candidates = DEFAULT_MIN_SAMPLES
        min_samples_candidates = sorted({int(ms) for ms in min_samples_candidates})
        if not min_samples_candidates or min_samples_candidates[0] < 1:
            raise ValueError("min_samples_candidates must contain positive integers")

        eps_candidates = self._estimate_eps_candidates()

        best_score = -1
        best_params = None
        best_labels = None

        total_combinations = len(eps_candidates) * len(min_samples_candidates)
        print(f"Testing {len(eps_candidates)} eps values x {len(min_samples_candidates)} min_samples values")
        print(f"Total combinations: {total_combinations}")
        print("This may take a while...\n")

        # Track all results for analysis
        all_results = []

        grid = itertools.product(eps_candidates, min_samples_candidates)
        for current_combination, (eps, min_samples) in enumerate(grid, start=1):
            # Progress indicator
            if current_combination % 50 == 0 or current_combination == total_combinations:
                progress = (current_combination / total_combinations) * 100
                print(f"Progress: {current_combination}/{total_combinations} ({progress:.1f}%)")

            try:
                labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(data)
            except Exception as exc:
                # Best effort: one bad combination must not abort the search.
                print(f"eps={eps:.4f}, min_samples={min_samples}: skipped ({str(exc)[:50]}...)")
                continue

            noise_mask = labels == -1
            n_noise = int(np.sum(noise_mask))
            n_clusters = self._count_clusters(labels)
            noise_ratio = n_noise / len(labels)

            # The score defaults to None so every stored trial has the key.
            result_info = {
                'eps': eps,
                'min_samples': min_samples,
                'n_clusters': n_clusters,
                'n_noise': n_noise,
                'noise_ratio': noise_ratio,
                'silhouette_score': None,
            }

            # Only score trials with real structure: >= 2 clusters, < 90% noise.
            clustered = ~noise_mask
            if n_clusters >= 2 and noise_ratio < 0.9 and np.sum(clustered) > 1:
                try:
                    # Silhouette is computed on the clustered points only.
                    score = float(silhouette_score(data[clustered], labels[clustered]))
                except Exception as exc:
                    print(f"eps={eps:.4f}, min_samples={min_samples}: "
                          f"silhouette failed ({str(exc)[:50]}...)")
                else:
                    result_info['silhouette_score'] = score

                    # Print promising results
                    if score > 0.1:
                        print(f"eps={eps:.4f}, min_samples={min_samples}: {n_clusters} clusters, "
                              f"{n_noise} noise ({noise_ratio:.1%}), silhouette={score:.4f}")

                    if score > best_score:
                        best_score = score
                        best_params = (eps, min_samples)
                        best_labels = labels

            all_results.append(result_info)

        # Analysis of results
        print("\n" + "="*50)
        print("DBSCAN GRID SEARCH ANALYSIS")
        print("="*50)

        if all_results:
            self._print_grid_search_analysis(all_results)

            # The original line began with a mis-encoded emoji; the garbled
            # characters were dropped because the glyph cannot be recovered.
            print("\nSAVING DETAILED RESULTS...")
            print("="*30)

            # Save detailed grid search results to JSON file
            self.save_dbscan_grid_search_results(all_results, best_params, best_score)

        if best_labels is None:
            print("DBSCAN could not find suitable clusters with the extensive grid search")
            print("Consider:")
            print("- Adjusting the embedding space (different model or preprocessing)")
            print("- Using different clustering algorithms")
            print("- Manual parameter tuning based on domain knowledge")
            return None

        n_clusters = self._count_clusters(best_labels)
        n_noise = int(np.sum(best_labels == -1))
        print("\nBest DBSCAN result:")
        print(f"Parameters: eps={best_params[0]:.4f}, min_samples={best_params[1]}")
        print(f"Number of clusters: {n_clusters}")
        print(f"Number of noise points: {n_noise} ({n_noise/len(best_labels)*100:.1f}%)")
        print(f"Silhouette score: {best_score:.4f}")
        return best_labels

    def save_dbscan_grid_search_results(self, all_results, best_params, best_score):
        """Save detailed DBSCAN grid search results to JSON file.

        Writes ``dbscan_grid_search_detailed_<timestamp>.json`` and, through
        :meth:`save_grid_search_csv`, a matching
        ``dbscan_grid_search_summary_<timestamp>.csv`` in the current directory.

        Args:
            all_results: list of trial dicts with keys ``eps``,
                ``min_samples``, ``n_clusters``, ``n_noise``, ``noise_ratio``
                and optionally ``silhouette_score`` (``None`` = not scored).
            best_params: ``(eps, min_samples)`` of the best trial, or ``None``.
            best_score: best silhouette score; values <= -1 mean "none".
        """
        # One clock reading so the embedded timestamp and file names agree.
        now = datetime.datetime.now()

        # Values are cast to built-in types: numpy scalars such as float32
        # are rejected by json.dump.
        grid_search_data = {
            "experiment_info": {
                "timestamp": now.isoformat(),
                "dataset_path": self.embeddings_path,
                "total_samples": len(self.file_paths),
                "embedding_dimension": int(self.embeddings.shape[1]),
                "total_combinations_tested": len(all_results)
            },
            "best_result": {
                "eps": float(best_params[0]) if best_params else None,
                "min_samples": int(best_params[1]) if best_params else None,
                "silhouette_score": float(best_score) if best_score > -1 else None
            },
            "all_trials": []
        }

        # Add all trial results
        for trial_id, result in enumerate(all_results, start=1):
            score = result.get('silhouette_score')
            score = None if score is None else float(score)
            grid_search_data["all_trials"].append({
                "trial_id": trial_id,
                "parameters": {
                    "eps": float(result['eps']),
                    "min_samples": int(result['min_samples'])
                },
                "results": {
                    "n_clusters": int(result['n_clusters']),
                    "n_noise": int(result['n_noise']),
                    "noise_ratio": float(result['noise_ratio']),
                    "silhouette_score": score
                },
                "status": "success" if score is not None else "failed"
            })

        # Summary statistics cover only the trials that produced a score.
        valid_trials = [t for t in grid_search_data["all_trials"] if t["status"] == "success"]
        if valid_trials:
            silhouette_scores = [t["results"]["silhouette_score"] for t in valid_trials]
            grid_search_data["summary_statistics"] = {
                "total_trials": len(all_results),
                "successful_trials": len(valid_trials),
                "success_rate": len(valid_trials) / len(all_results),
                "best_silhouette_score": max(silhouette_scores),
                "worst_silhouette_score": min(silhouette_scores),
                "mean_silhouette_score": sum(silhouette_scores) / len(silhouette_scores),
                # True median: averages the two middle values for even counts.
                "median_silhouette_score": statistics.median(silhouette_scores)
            }

            # Top 10 results
            grid_search_data["top_10_results"] = sorted(
                valid_trials,
                key=lambda trial: trial["results"]["silhouette_score"],
                reverse=True,
            )[:10]

            # Parameter analysis
            eps_values = [t["parameters"]["eps"] for t in valid_trials]
            min_samples_values = [t["parameters"]["min_samples"] for t in valid_trials]
            grid_search_data["parameter_analysis"] = {
                "eps_range": {
                    "min": min(eps_values),
                    "max": max(eps_values),
                    "mean": sum(eps_values) / len(eps_values)
                },
                "min_samples_range": {
                    "min": min(min_samples_values),
                    "max": max(min_samples_values),
                    "mean": sum(min_samples_values) / len(min_samples_values)
                }
            }

        # Save to file with timestamp
        timestamp = now.strftime("%Y%m%d_%H%M%S")
        filename = f"dbscan_grid_search_detailed_{timestamp}.json"

        # ensure_ascii=False can emit non-ASCII text, so fix the encoding.
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(grid_search_data, f, indent=4, ensure_ascii=False)

        print(f"Detailed grid search results saved to: {filename}")

        # Also save a CSV summary for easy analysis
        csv_filename = f"dbscan_grid_search_summary_{timestamp}.csv"
        self.save_grid_search_csv(all_results, csv_filename)
        print(f"Grid search summary CSV saved to: {csv_filename}")

    def save_grid_search_csv(self, all_results, filename):
        """Save grid search results as CSV for easy analysis.

        Args:
            all_results: list of trial dicts as produced by run_dbscan.
            filename: path of the CSV file to (over)write.
        """
        fieldnames = ['trial_id', 'eps', 'min_samples', 'n_clusters', 'n_noise',
                      'noise_ratio', 'silhouette_score', 'status']

        with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()

            for trial_id, result in enumerate(all_results, start=1):
                score = result.get('silhouette_score')
                writer.writerow({
                    'trial_id': trial_id,
                    'eps': result['eps'],
                    'min_samples': result['min_samples'],
                    'n_clusters': result['n_clusters'],
                    'n_noise': result['n_noise'],
                    'noise_ratio': result['noise_ratio'],
                    'silhouette_score': score,
                    'status': 'success' if score is not None else 'failed'
                })

    def run_mean_shift(self):
        """Run Mean Shift clustering.

        Tries several bandwidths derived from sklearn's ``estimate_bandwidth``
        and from mean nearest-neighbour distances, keeping the one with the
        best silhouette score.

        Returns:
            The best label array, or ``None`` if no bandwidth produced
            between 2 and ``n_samples // 3`` clusters.
        """
        print("\n" + "="*50)
        print("RUNNING MEAN SHIFT CLUSTERING")
        print("="*50)

        data = self.embeddings_scaled
        n_samples = len(data)
        bandwidth_candidates = []

        # Method 1: sklearn's estimate_bandwidth
        try:
            bw_est = estimate_bandwidth(data, quantile=0.3, n_samples=min(500, n_samples))
        except Exception as exc:
            # Best effort: the neighbour-distance estimate below still applies.
            print(f"Bandwidth estimation failed ({str(exc)[:50]}...)")
        else:
            if bw_est > 0:
                bandwidth_candidates.extend([bw_est * 0.5, bw_est, bw_est * 1.5])

        # Method 2: nearest neighbor distances.  n_neighbors may not exceed the
        # sample count, and at least one neighbour besides the point itself is
        # needed for a meaningful mean.
        n_neighbors = min(10, n_samples)
        if n_neighbors > 1:
            neighbors = NearestNeighbors(n_neighbors=n_neighbors).fit(data)
            distances, _ = neighbors.kneighbors(data)
            mean_dist = np.mean(distances[:, 1:])  # Exclude self-distance
            bandwidth_candidates.extend([mean_dist * 0.5, mean_dist, mean_dist * 1.5])

        # Remove duplicates and invalid values; sort for a stable trial order.
        bandwidth_candidates = sorted(
            {float(bw) for bw in bandwidth_candidates if np.isfinite(bw) and bw > 0}
        )
        if not bandwidth_candidates:
            bandwidth_candidates = [0.5, 1.0, 1.5, 2.0]

        best_score = -1
        best_bandwidth = None
        best_labels = None

        print("Testing different bandwidth values...")

        for bandwidth in bandwidth_candidates:
            try:
                labels = MeanShift(bandwidth=bandwidth).fit_predict(data)
                n_clusters = len(set(labels))

                if 2 <= n_clusters <= n_samples // 3:
                    score = silhouette_score(data, labels)
                    print(f"bandwidth={bandwidth:.4f}: {n_clusters} clusters, silhouette={score:.4f}")

                    if score > best_score:
                        best_score = score
                        best_bandwidth = bandwidth
                        best_labels = labels
            except Exception as e:
                print(f"bandwidth={bandwidth:.4f}: failed ({str(e)[:50]}...)")
                continue

        if best_labels is None:
            print("Mean Shift could not find suitable clusters")
            return None

        n_clusters = len(set(best_labels))
        print("\nBest Mean Shift result:")
        print(f"Bandwidth: {best_bandwidth:.4f}")
        print(f"Number of clusters: {n_clusters}")
        print(f"Silhouette score: {best_score:.4f}")
        return best_labels

    def run_affinity_propagation(self):
        """Run Affinity Propagation clustering.

        Grid-searches ``preference`` (percentiles of the pairwise
        similarities) and ``damping``, keeping the combination with the best
        silhouette score.

        Returns:
            The best label array, or ``None`` if no combination produced
            between 2 and ``n_samples // 3`` clusters.
        """
        print("\n" + "="*50)
        print("RUNNING AFFINITY PROPAGATION CLUSTERING")
        print("="*50)

        data = self.embeddings_scaled
        n_samples = len(data)
        if n_samples < 2:
            # No pairwise similarities exist to derive a preference from.
            print("Affinity Propagation could not find suitable clusters")
            return None

        # Similarity = negative squared Euclidean distance between distinct
        # points.  pdist yields only the off-diagonal values and avoids an
        # (n, n, d) temporary.  The diagonal is excluded on purpose: self
        # similarity is always 0, which carries no information.
        similarities = -pdist(data, metric='sqeuclidean')

        # Try different preference values (percentiles of the similarities)
        preference_candidates = sorted(
            {float(p) for p in np.percentile(similarities, [10, 25, 50, 75])}
        )
        damping_candidates = [0.5, 0.7, 0.8, 0.9]

        best_score = -1
        best_params = None
        best_labels = None

        print("Testing different parameter combinations...")

        for preference, damping in itertools.product(preference_candidates, damping_candidates):
            try:
                affinity_prop = AffinityPropagation(
                    preference=preference,
                    damping=damping,
                    random_state=RANDOM_SEED,
                    max_iter=200
                )
                labels = affinity_prop.fit_predict(data)
                n_clusters = len(set(labels))

                if 2 <= n_clusters <= n_samples // 3:
                    score = silhouette_score(data, labels)
                    print(f"preference={preference:.2f}, damping={damping:.1f}: "
                          f"{n_clusters} clusters, silhouette={score:.4f}")

                    if score > best_score:
                        best_score = score
                        best_params = (preference, damping)
                        best_labels = labels
            except Exception as e:
                print(f"preference={preference:.2f}, damping={damping:.1f}: failed ({str(e)[:30]}...)")
                continue

        if best_labels is None:
            print("Affinity Propagation could not find suitable clusters")
            return None

        n_clusters = len(set(best_labels))
        print("\nBest Affinity Propagation result:")
        print(f"Parameters: preference={best_params[0]:.2f}, damping={best_params[1]:.1f}")
        print(f"Number of clusters: {n_clusters}")
        print(f"Silhouette score: {best_score:.4f}")
        return best_labels

    def visualize_results(self, results_dict):
        """Visualize clustering results using PCA.

        Draws one scatter panel per method on a 2-component PCA projection,
        saves the figure to ``auto_clustering_results.png`` and shows it.

        Args:
            results_dict: mapping of method display name -> label sequence
                aligned with the loaded embeddings.
        """
        if not results_dict:
            print("No results to visualize")
            return

        # Reduce dimensions for visualization
        pca = PCA(n_components=2, random_state=RANDOM_SEED)
        embeddings_2d = pca.fit_transform(self.embeddings_scaled)

        n_methods = len(results_dict)
        # squeeze=False always yields a 2-D axes array, even for one panel.
        fig, axes = plt.subplots(1, n_methods, figsize=(5*n_methods, 4), squeeze=False)

        for ax, (method_name, labels) in zip(axes[0], results_dict.items()):
            labels = np.asarray(labels)  # boolean masks below need an array
            unique_labels = sorted(set(labels.tolist()))
            colors = plt.cm.Set1(np.linspace(0, 1, len(unique_labels)))

            for label, color in zip(unique_labels, colors):
                mask = labels == label
                if label == -1:
                    # Noise points (DBSCAN label -1) in black
                    ax.scatter(embeddings_2d[mask, 0], embeddings_2d[mask, 1],
                               c='black', marker='x', s=20, alpha=0.5, label='Noise')
                else:
                    ax.scatter(embeddings_2d[mask, 0], embeddings_2d[mask, 1],
                               c=[color], s=50, alpha=0.7, label=f'Cluster {label}')

            ax.set_title(f'{method_name}\n({self._count_clusters(labels)} clusters)')
            ax.set_xlabel('PCA Component 1')
            ax.set_ylabel('PCA Component 2')
            ax.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.savefig('auto_clustering_results.png', dpi=300, bbox_inches='tight')
        plt.show()
        plt.close(fig)  # release the figure once it has been saved and shown

        print("\nVisualization saved as 'auto_clustering_results.png'")

    def save_results(self, results_dict):
        """Save clustering results to JSON files.

        Writes one ``<method_name>_results.json`` per entry (lower-cased,
        spaces replaced by underscores) in the current directory.

        Args:
            results_dict: mapping of method display name -> label sequence
                aligned with ``self.file_paths``.
        """
        for method_name, labels in results_dict.items():
            # Only DBSCAN results are flagged as noise.
            flags_noise = method_name == 'DBSCAN'

            # bool()/int() convert numpy scalars, which json cannot serialise.
            method_results = [
                {
                    "filepath": filepath,
                    "cluster": int(label),
                    "is_noise": bool(flags_noise and label == -1)
                }
                for filepath, label in zip(self.file_paths, labels)
            ]

            # Save to file
            filename = f"{method_name.lower().replace(' ', '_')}_results.json"
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump({
                    "method": method_name,
                    "n_clusters": self._count_clusters(labels),
                    "n_samples": len(labels),
                    "results": method_results
                }, f, indent=4)

            print(f"Results saved to {filename}")

    def run_all_methods(self, methods=None, min_samples_candidates=None):
        """Run the selected automatic clustering methods and report on them.

        Args:
            methods: iterable of keys from ``METHODS`` to run.  Defaults to
                ``("dbscan",)``.
                NOTE(review): the Mean Shift and Affinity Propagation calls
                were commented out in the original, so DBSCAN-only stays the
                default; confirm whether "all" should run them again.
            min_samples_candidates: forwarded to :meth:`run_dbscan`.

        Returns:
            Dict mapping display name -> labels for each method that found
            clusters (empty if none did).

        Raises:
            ValueError: if ``methods`` contains an unknown key.
        """
        selected = ["dbscan"] if methods is None else list(methods)
        unknown = [key for key in selected if key not in self.METHODS]
        if unknown:
            raise ValueError(f"Unknown clustering method(s): {unknown}")

        print("\n" + "="*70)
        print("AUTOMATIC CLUSTERING ANALYSIS")
        print("="*70)
        print(f"Dataset: {len(self.file_paths)} documents")
        print(f"Embedding dimension: {self.embeddings.shape[1]}")

        results = {}
        for key in selected:
            display_name, runner_name = self.METHODS[key]
            # Only the DBSCAN runner takes the min_samples grid.
            kwargs = {"min_samples_candidates": min_samples_candidates} if key == "dbscan" else {}
            labels = getattr(self, runner_name)(**kwargs)
            if labels is not None:
                results[display_name] = labels

        if not results:
            print("\nNo automatic clustering method found suitable clusters.")
            print("This might indicate:")
            print("- Data doesn't have clear cluster structure")
            print("- Embeddings need different preprocessing")
            print("- Different parameter ranges needed")
            return results

        # Summary
        print("\n" + "="*70)
        print("SUMMARY OF RESULTS")
        print("="*70)

        for method, labels in results.items():
            n_clusters = self._count_clusters(labels)
            if method == "DBSCAN":
                n_noise = int(np.sum(np.asarray(labels) == -1))
                print(f"{method}: {n_clusters} clusters, {n_noise} noise points")
            else:
                print(f"{method}: {n_clusters} clusters")

        # Calculate agreement between methods if multiple succeeded
        if len(results) > 1:
            print("\nMethod Agreement (Adjusted Rand Index):")
            for first, second in itertools.combinations(results, 2):
                ari = adjusted_rand_score(results[first], results[second])
                print(f"{first} vs {second}: {ari:.4f}")

        # Visualize and save results
        self.visualize_results(results)
        self.save_results(results)
        return results


def main():
    """Parse command-line arguments and run the requested clustering method(s)."""
    parser = argparse.ArgumentParser(description="Run automatic clustering methods on document embeddings")
    # Required: without a path there is nothing to load.
    parser.add_argument("--embeddings_path", required=True, help="Path to embeddings JSON file")
    parser.add_argument("--method", choices=['dbscan', 'meanshift', 'affinity', 'all'],
                        default='all', help="Which automatic method to run")
    parser.add_argument("--min_samples", type=int, nargs="+", default=None,
                        help="DBSCAN min_samples values to try (default: 60)")

    args = parser.parse_args()

    # Initialize clustering
    clustering = AutoClustering(args.embeddings_path)

    # Run selected method(s)
    if args.method == 'all':
        clustering.run_all_methods(min_samples_candidates=args.min_samples)
        return

    display_name, runner_name = AutoClustering.METHODS[args.method]
    # Only the DBSCAN runner takes the min_samples grid.
    kwargs = {"min_samples_candidates": args.min_samples} if args.method == 'dbscan' else {}
    labels = getattr(clustering, runner_name)(**kwargs)
    if labels is not None:
        clustering.visualize_results({display_name: labels})
        clustering.save_results({display_name: labels})


if __name__ == "__main__":
    main()