#!/usr/bin/env python3 """ Extensive Gaussian Mixture Model clustering with grid search for optimal parameters Includes BIC and AIC metrics for model selection """ import json import numpy as np import matplotlib.pyplot as plt from sklearn.mixture import GaussianMixture from sklearn.preprocessing import StandardScaler from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score from sklearn.decomposition import PCA import datetime import csv import argparse import warnings warnings.filterwarnings('ignore') class GMMExtensiveClustering: def __init__(self, embeddings_path): self.embeddings_path = embeddings_path self.embeddings = None self.file_paths = None self.load_embeddings() def load_embeddings(self): """Load embeddings from JSON file""" print(f"Loading embeddings from {self.embeddings_path}...") with open(self.embeddings_path, 'r') as f: data = json.load(f) self.file_paths = [] embeddings_list = [] for item in data: self.file_paths.append(item['filepath']) embeddings_list.append(item['embedding']) self.embeddings = np.array(embeddings_list, dtype=np.float32) print(f"Loaded {len(self.file_paths)} samples with embedding dimension {self.embeddings.shape[1]}") # Standardize embeddings for better clustering self.scaler = StandardScaler() self.embeddings_scaled = self.scaler.fit_transform(self.embeddings) def run_gmm_grid_search(self): """Run GMM with optimized grid search for faster execution""" print("\n" + "="*70) print("RUNNING GAUSSIAN MIXTURE MODEL CLUSTERING WITH OPTIMIZED GRID SEARCH") print("="*70) # Optimized GMM parameter candidates for faster execution # Smart n_components range with larger steps max_components = min(50, len(self.embeddings_scaled) // 20) # Reduced max and increased divisor n_components_candidates = [] # Progressive step sizes: smaller steps for low numbers, larger for high for n in range(2, max_components + 1): if n <= 5: n_components_candidates.append(n) # 2, 3, 4, 5 elif n <= 10: if n % 2 == 0: # 6, 8, 10 n_components_candidates.append(n) else: if n % 3 == 2: # 11, 14, 17, 20 n_components_candidates.append(n) # Reduced covariance types - focus on most important ones covariance_types = [ # 'full', 'diag', 'tied', 'spherical' ] # Removed 'tied' and 'spherical' as they're less common # Simplified regularization - focus on key values reg_covar_candidates = [1e-5, 1e-4, 1e-3] # Removed extreme values # Reduced n_init - 1 is often sufficient for good initialization methods n_init_candidates = [1, 5] # Removed 10 to save time # Focus on best initialization methods init_params_candidates = ['kmeans', 'k-means++'] # Removed 'random' and 'random_from_data' # Simplified max_iter - most problems converge quickly max_iter_candidates = [100, 300] # Removed 500, added 300 as middle ground print(f"Optimized parameter combinations:") print(f" - n_components: {len(n_components_candidates)} values {n_components_candidates}") print(f" - covariance_types: {len(covariance_types)} options {covariance_types}") print(f" - reg_covar: {len(reg_covar_candidates)} values {reg_covar_candidates}") print(f" - n_init: {len(n_init_candidates)} values {n_init_candidates}") print(f" - init_params: {len(init_params_candidates)} options {init_params_candidates}") print(f" - max_iter: {len(max_iter_candidates)} values {max_iter_candidates}") total_combinations = (len(n_components_candidates) * len(covariance_types) * len(reg_covar_candidates) * len(n_init_candidates) * len(init_params_candidates) * len(max_iter_candidates)) print(f"Total combinations: 
        # Estimate time
        estimated_time_per_combination = 0.5  # seconds (conservative estimate)
        estimated_total_time = total_combinations * estimated_time_per_combination
        print(f"Estimated runtime: {estimated_total_time / 60:.1f} minutes")
        print("This should be much faster...\n")

        # Track all results for analysis
        all_results = []

        # Early stopping criteria for speed optimization
        early_stopping_threshold = 0.7  # If we find a very good silhouette score, we can be less exhaustive
        good_results_found = 0
        max_good_results = 5  # Stop early if we find several very good results

        best_bic_score = float('inf')
        best_aic_score = float('inf')
        best_silhouette_score = -1
        best_params_bic = None
        best_params_aic = None
        best_params_silhouette = None
        best_labels_bic = None
        best_labels_aic = None
        best_labels_silhouette = None

        current_combination = 0

        # Optimized iteration order: test simpler models first (fewer components, simpler covariance)
        for covariance_type in covariance_types:  # Start with covariance type
            for n_components in n_components_candidates:  # Then components
                for init_params in init_params_candidates:  # Good initialization methods
                    for reg_covar in reg_covar_candidates:  # Regularization
                        for n_init in n_init_candidates:  # Number of initializations
                            for max_iter in max_iter_candidates:  # Iterations last
                                current_combination += 1

                                # Progress indicator with time estimation
                                if current_combination % 50 == 0 or current_combination == total_combinations:
                                    progress = (current_combination / total_combinations) * 100
                                    print(f"Progress: {current_combination}/{total_combinations} ({progress:.1f}%) - "
                                          f"Best scores so far: BIC={best_bic_score:.2f}, Silhouette={best_silhouette_score:.3f}")

                                try:
                                    # Early convergence check for faster models
                                    tol = 1e-3 if n_components <= 5 else 1e-4  # Less strict tolerance for simple models

                                    # Run GMM
                                    gmm = GaussianMixture(
                                        n_components=n_components,
                                        covariance_type=covariance_type,
                                        reg_covar=reg_covar,
                                        n_init=n_init,
                                        init_params=init_params,
                                        max_iter=max_iter,
                                        tol=tol,  # Added tolerance for faster convergence
                                        random_state=42
                                    )

                                    # Fit and predict
                                    gmm.fit(self.embeddings_scaled)
                                    labels = gmm.predict(self.embeddings_scaled)

                                    # Quick validation - skip if the model didn't converge properly
                                    if not gmm.converged_ and max_iter <= 100:
                                        continue  # Skip non-converged simple models

                                    # Calculate metrics
                                    bic_score = gmm.bic(self.embeddings_scaled)
                                    aic_score = gmm.aic(self.embeddings_scaled)
                                    log_likelihood = gmm.score(self.embeddings_scaled)

                                    # Only calculate clustering metrics if we have multiple clusters
                                    if len(set(labels)) > 1:
                                        silhouette = silhouette_score(self.embeddings_scaled, labels)
                                        calinski_harabasz = calinski_harabasz_score(self.embeddings_scaled, labels)
                                        davies_bouldin = davies_bouldin_score(self.embeddings_scaled, labels)

                                        # Early stopping check
                                        if silhouette > early_stopping_threshold:
                                            good_results_found += 1
                                            print(f"šŸŽÆ Excellent result found: n_comp={n_components}, cov={covariance_type}, "
                                                  f"silhouette={silhouette:.4f}")
                                    else:
                                        silhouette = -1
                                        calinski_harabasz = 0
                                        davies_bouldin = float('inf')

                                    # Store result for analysis
                                    result_info = {
                                        'n_components': n_components,
                                        'covariance_type': covariance_type,
                                        'reg_covar': reg_covar,
                                        'n_init': n_init,
                                        'init_params': init_params,
                                        'max_iter': max_iter,
                                        'bic_score': bic_score,
                                        'aic_score': aic_score,
                                        'log_likelihood': log_likelihood,
                                        'silhouette_score': silhouette,
                                        'calinski_harabasz_score': calinski_harabasz,
                                        'davies_bouldin_score': davies_bouldin,
                                        'converged': gmm.converged_,
                                        'n_iter': gmm.n_iter_,
                                        'unique_clusters': len(set(labels))
                                    }
                                    all_results.append(result_info)

                                    # Print promising results
                                    if (silhouette > 0.3 and
                                            bic_score < np.percentile([r['bic_score'] for r in all_results], 25)):
                                        print(f"n_components={n_components}, cov={covariance_type}, init={init_params}: "
                                              f"BIC={bic_score:.2f}, AIC={aic_score:.2f}, silhouette={silhouette:.4f}")

                                    # Track best results for different criteria
                                    if bic_score < best_bic_score:
                                        best_bic_score = bic_score
                                        best_params_bic = {
                                            'n_components': n_components,
                                            'covariance_type': covariance_type,
                                            'reg_covar': reg_covar,
                                            'n_init': n_init,
                                            'init_params': init_params,
                                            'max_iter': max_iter
                                        }
                                        best_labels_bic = labels

                                    if aic_score < best_aic_score:
                                        best_aic_score = aic_score
                                        best_params_aic = {
                                            'n_components': n_components,
                                            'covariance_type': covariance_type,
                                            'reg_covar': reg_covar,
                                            'n_init': n_init,
                                            'init_params': init_params,
                                            'max_iter': max_iter
                                        }
                                        best_labels_aic = labels

                                    if silhouette > best_silhouette_score and len(set(labels)) > 1:
                                        best_silhouette_score = silhouette
                                        best_params_silhouette = {
                                            'n_components': n_components,
                                            'covariance_type': covariance_type,
                                            'reg_covar': reg_covar,
                                            'n_init': n_init,
                                            'init_params': init_params,
                                            'max_iter': max_iter
                                        }
                                        best_labels_silhouette = labels

                                    # Early stopping check
                                    if good_results_found >= max_good_results and silhouette > 0.6:
                                        print(f"šŸ›‘ Early stopping triggered: Found {good_results_found} excellent results. "
                                              f"Stopping at {current_combination}/{total_combinations} combinations.")
                                        break

                                except Exception:
                                    # Skip problematic parameter combinations
                                    continue

                            # Break out of the nested loops if early stopping was triggered
                            if good_results_found >= max_good_results and best_silhouette_score > 0.6:
                                break
                        if good_results_found >= max_good_results and best_silhouette_score > 0.6:
                            break
                    if good_results_found >= max_good_results and best_silhouette_score > 0.6:
                        break
                if good_results_found >= max_good_results and best_silhouette_score > 0.6:
                    break
            if good_results_found >= max_good_results and best_silhouette_score > 0.6:
                break

        # Analysis of results
        print("\n" + "=" * 70)
        print("GAUSSIAN MIXTURE MODEL GRID SEARCH ANALYSIS")
        print("=" * 70)

        if all_results:
            import pandas as pd
            df_results = pd.DataFrame(all_results)

            print(f"Total parameter combinations tested: {len(df_results)}")

            # Filter results with valid clustering (more than 1 cluster)
            valid_results = df_results[df_results['unique_clusters'] > 1]
            print(f"Combinations with valid clustering: {len(valid_results)}")

            if len(valid_results) > 0:
                # Best scores analysis
                print("\nModel Selection Metrics:")
                print(f"Best BIC score: {df_results['bic_score'].min():.2f}")
                print(f"Best AIC score: {df_results['aic_score'].min():.2f}")
                print(f"Best Log-Likelihood: {df_results['log_likelihood'].max():.2f}")

                print("\nClustering Quality Metrics:")
                print(f"Best silhouette score: {valid_results['silhouette_score'].max():.4f}")
                print(f"Mean silhouette score: {valid_results['silhouette_score'].mean():.4f}")
                print(f"Best Calinski-Harabasz score: {valid_results['calinski_harabasz_score'].max():.2f}")
                print(f"Best Davies-Bouldin score: {valid_results['davies_bouldin_score'].min():.4f}")

            # Top results by different criteria
            print("\nTop 5 results by BIC (lower is better):")
            top_bic = df_results.nsmallest(5, 'bic_score')
            for idx, row in top_bic.iterrows():
                print(f"  n_comp={row['n_components']}, cov={row['covariance_type']}: "
                      f"BIC={row['bic_score']:.2f}, AIC={row['aic_score']:.2f}")

            print("\nTop 5 results by AIC (lower is better):")
            top_aic = df_results.nsmallest(5, 'aic_score')
            for idx, row in top_aic.iterrows():
                print(f"  n_comp={row['n_components']}, cov={row['covariance_type']}: "
                      f"BIC={row['bic_score']:.2f}, AIC={row['aic_score']:.2f}")
AIC={row['aic_score']:.2f}") if len(valid_results) > 0: print(f"\nTop 5 results by Silhouette Score:") top_silhouette = valid_results.nlargest(5, 'silhouette_score') for idx, row in top_silhouette.iterrows(): print(f" n_comp={row['n_components']}, cov={row['covariance_type']}: " f"silhouette={row['silhouette_score']:.4f}") # Component count analysis component_performance = df_results.groupby('n_components').agg({ 'bic_score': 'min', 'aic_score': 'min', 'silhouette_score': 'max' }).reset_index() print(f"\nComponent count analysis (top 10 by BIC):") top_components = component_performance.nsmallest(10, 'bic_score') for idx, row in top_components.iterrows(): print(f" {row['n_components']} components: " f"BIC={row['bic_score']:.2f}, AIC={row['aic_score']:.2f}, " f"silhouette={row['silhouette_score']:.4f}") print(f"\nšŸ“ SAVING DETAILED RESULTS...") print("="*30) # Save detailed grid search results self.save_gmm_grid_search_results(all_results, best_params_bic, best_bic_score, best_params_aic, best_aic_score, best_params_silhouette, best_silhouette_score) # Return best results based on BIC (primary), AIC (secondary), Silhouette (tertiary) results = { 'bic': (best_labels_bic, best_params_bic, best_bic_score), 'aic': (best_labels_aic, best_params_aic, best_aic_score), 'silhouette': (best_labels_silhouette, best_params_silhouette, best_silhouette_score) } # Print best results if best_labels_bic is not None: print(f"\nBest GMM result by BIC:") print(f"Parameters: {best_params_bic}") print(f"BIC score: {best_bic_score:.2f}") if best_labels_aic is not None: print(f"\nBest GMM result by AIC:") print(f"Parameters: {best_params_aic}") print(f"AIC score: {best_aic_score:.2f}") if best_labels_silhouette is not None: print(f"\nBest GMM result by Silhouette:") print(f"Parameters: {best_params_silhouette}") print(f"Silhouette score: {best_silhouette_score:.4f}") return results def save_gmm_grid_search_results(self, all_results, best_params_bic, best_bic_score, best_params_aic, best_aic_score, best_params_silhouette, best_silhouette_score): """Save detailed GMM grid search results to JSON file""" # Prepare comprehensive results data grid_search_data = { "experiment_info": { "timestamp": datetime.datetime.now().isoformat(), "dataset_path": self.embeddings_path, "total_samples": len(self.file_paths), "embedding_dimension": self.embeddings.shape[1], "total_combinations_tested": len(all_results), "method": "Gaussian Mixture Model" }, "best_results": { "by_bic": { "parameters": best_params_bic, "bic_score": best_bic_score if best_bic_score != float('inf') else None }, "by_aic": { "parameters": best_params_aic, "aic_score": best_aic_score if best_aic_score != float('inf') else None }, "by_silhouette": { "parameters": best_params_silhouette, "silhouette_score": best_silhouette_score if best_silhouette_score > -1 else None } }, "all_trials": [] } # Add all trial results for i, result in enumerate(all_results): trial_data = { "trial_id": i + 1, "parameters": { "n_components": result['n_components'], "covariance_type": result['covariance_type'], "reg_covar": result['reg_covar'], "n_init": result['n_init'], "init_params": result['init_params'], "max_iter": result['max_iter'] }, "results": { "bic_score": result['bic_score'], "aic_score": result['aic_score'], "log_likelihood": result['log_likelihood'], "silhouette_score": result['silhouette_score'], "calinski_harabasz_score": result['calinski_harabasz_score'], "davies_bouldin_score": result['davies_bouldin_score'], "converged": result['converged'], "n_iter": 
result['n_iter'], "unique_clusters": result['unique_clusters'] } } grid_search_data["all_trials"].append(trial_data) # Calculate summary statistics if all_results: bic_scores = [r['bic_score'] for r in all_results] aic_scores = [r['aic_score'] for r in all_results] log_likelihoods = [r['log_likelihood'] for r in all_results] valid_silhouette = [r['silhouette_score'] for r in all_results if r['silhouette_score'] > -1] grid_search_data["summary_statistics"] = { "total_trials": len(all_results), "valid_clustering_trials": len(valid_silhouette), "bic_score": { "best": min(bic_scores), "worst": max(bic_scores), "mean": sum(bic_scores) / len(bic_scores), "median": sorted(bic_scores)[len(bic_scores)//2] }, "aic_score": { "best": min(aic_scores), "worst": max(aic_scores), "mean": sum(aic_scores) / len(aic_scores), "median": sorted(aic_scores)[len(aic_scores)//2] }, "log_likelihood": { "best": max(log_likelihoods), "worst": min(log_likelihoods), "mean": sum(log_likelihoods) / len(log_likelihoods) } } if valid_silhouette: grid_search_data["summary_statistics"]["silhouette_score"] = { "best": max(valid_silhouette), "worst": min(valid_silhouette), "mean": sum(valid_silhouette) / len(valid_silhouette), "median": sorted(valid_silhouette)[len(valid_silhouette)//2] } # Top 10 results by different criteria sorted_by_bic = sorted(all_results, key=lambda x: x['bic_score']) sorted_by_aic = sorted(all_results, key=lambda x: x['aic_score']) valid_results = [r for r in all_results if r['silhouette_score'] > -1] sorted_by_silhouette = sorted(valid_results, key=lambda x: x['silhouette_score'], reverse=True) grid_search_data["top_10_results"] = { "by_bic": [], "by_aic": [], "by_silhouette": [] } for i, result in enumerate(sorted_by_bic[:10]): grid_search_data["top_10_results"]["by_bic"].append({ "rank": i + 1, "parameters": { "n_components": result['n_components'], "covariance_type": result['covariance_type'], "init_params": result['init_params'] }, "bic_score": result['bic_score'], "aic_score": result['aic_score'] }) for i, result in enumerate(sorted_by_aic[:10]): grid_search_data["top_10_results"]["by_aic"].append({ "rank": i + 1, "parameters": { "n_components": result['n_components'], "covariance_type": result['covariance_type'], "init_params": result['init_params'] }, "bic_score": result['bic_score'], "aic_score": result['aic_score'] }) for i, result in enumerate(sorted_by_silhouette[:10]): grid_search_data["top_10_results"]["by_silhouette"].append({ "rank": i + 1, "parameters": { "n_components": result['n_components'], "covariance_type": result['covariance_type'], "init_params": result['init_params'] }, "silhouette_score": result['silhouette_score'] }) # Save to file with timestamp timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") filename = f"gmm_grid_search_detailed_{timestamp}.json" # print() # with open(filename, 'w') as f: # json.dump(grid_search_data, f, indent=4, ensure_ascii=False) print(f"Detailed grid search results saved to: {filename}") # Also save a CSV summary for easy analysis csv_filename = f"gmm_grid_search_summary_{timestamp}.csv" self.save_grid_search_csv(all_results, csv_filename) print(f"Grid search summary CSV saved to: {csv_filename}") def save_grid_search_csv(self, all_results, filename): """Save grid search results as CSV for easy analysis""" with open(filename, 'w', newline='', encoding='utf-8') as csvfile: fieldnames = ['trial_id', 'n_components', 'covariance_type', 'reg_covar', 'n_init', 'init_params', 'max_iter', 'bic_score', 'aic_score', 'log_likelihood', 
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()

            for i, result in enumerate(all_results):
                writer.writerow({
                    'trial_id': i + 1,
                    'n_components': result['n_components'],
                    'covariance_type': result['covariance_type'],
                    'reg_covar': result['reg_covar'],
                    'n_init': result['n_init'],
                    'init_params': result['init_params'],
                    'max_iter': result['max_iter'],
                    'bic_score': result['bic_score'],
                    'aic_score': result['aic_score'],
                    'log_likelihood': result['log_likelihood'],
                    'silhouette_score': result['silhouette_score'],
                    'calinski_harabasz_score': result['calinski_harabasz_score'],
                    'davies_bouldin_score': result['davies_bouldin_score'],
                    'converged': result['converged'],
                    'n_iter': result['n_iter'],
                    'unique_clusters': result['unique_clusters']
                })

    def visualize_results(self, results):
        """Visualize clustering results using PCA"""
        fig, axes = plt.subplots(1, 3, figsize=(18, 6))

        # Reduce dimensions for visualization
        pca = PCA(n_components=2, random_state=42)
        embeddings_2d = pca.fit_transform(self.embeddings_scaled)

        methods = ['bic', 'aic', 'silhouette']
        titles = ['Best by BIC', 'Best by AIC', 'Best by Silhouette']

        for idx, (method, title) in enumerate(zip(methods, titles)):
            labels, params, score = results[method]

            if labels is not None:
                unique_labels = set(labels)
                colors = plt.cm.Set3(np.linspace(0, 1, len(unique_labels)))

                for label, color in zip(unique_labels, colors):
                    mask = labels == label
                    axes[idx].scatter(embeddings_2d[mask, 0], embeddings_2d[mask, 1],
                                      c=[color], s=50, alpha=0.7, label=f'Cluster {label}')

                axes[idx].set_title(f'{title}\nn_components={params["n_components"]}, '
                                    f'cov={params["covariance_type"]}')
            else:
                axes[idx].text(0.5, 0.5, 'No valid clustering', ha='center', va='center',
                               transform=axes[idx].transAxes, fontsize=12)
                axes[idx].set_title(f'{title}\n(Failed)')

            axes[idx].set_xlabel('PCA Component 1')
            axes[idx].set_ylabel('PCA Component 2')
            axes[idx].grid(True, alpha=0.3)

        plt.tight_layout()
        plt.savefig('gmm_clustering_results.png', dpi=300, bbox_inches='tight')
        plt.show()
        print("Visualization saved as 'gmm_clustering_results.png'")

    def save_clustering_results(self, results):
        """Save final clustering results to JSON files"""
        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

        for method in ['bic', 'aic', 'silhouette']:
            labels, params, score = results[method]

            if labels is not None:
                clustering_results = []
                for filepath, label in zip(self.file_paths, labels):
                    clustering_results.append({
                        "filepath": filepath,
                        "cluster": int(label)
                    })

                filename = f"gmm_final_results_{method}_{timestamp}.json"
                with open(filename, 'w') as f:
                    json.dump({
                        "method": f"GMM (best by {method.upper()})",
                        "parameters": params,
                        "n_components": params['n_components'],
                        "n_samples": len(labels),
                        f"{method}_score": score,
                        "results": clustering_results
                    }, f, indent=4)

                print(f"Final clustering results ({method}) saved to: {filename}")


def main():
    parser = argparse.ArgumentParser(description="Run extensive Gaussian Mixture Model clustering on document embeddings")
    parser.add_argument("--embeddings_path", required=True, help="Path to embeddings JSON file")
    args = parser.parse_args()

    # Initialize clustering
    clustering = GMMExtensiveClustering(args.embeddings_path)

    # Run extensive grid search
    results = clustering.run_gmm_grid_search()

    if any(labels is not None for labels, _, _ in results.values()):
        # Visualize and save results
        clustering.visualize_results(results)
        clustering.save_clustering_results(results)
        print("\nGMM extensive clustering completed successfully!")
    else:
        print("\nGMM extensive clustering did not find suitable clusters.")


if __name__ == "__main__":
    main()
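
# ---------------------------------------------------------------------------
# Example usage (illustrative; the script and embeddings file names below are
# placeholders, not fixed by this module):
#
#   python gmm_extensive_clustering.py --embeddings_path embeddings.json
#
# The embeddings JSON is expected to be a list of records shaped like:
#
#   [
#     {"filepath": "docs/report_01.txt", "embedding": [0.12, -0.03, 0.88]},
#     {"filepath": "docs/report_02.txt", "embedding": [0.07, 0.21, -0.45]}
#   ]
#
# A minimal sketch of refitting the winning configuration afterwards to obtain
# soft cluster memberships via GaussianMixture.predict_proba, assuming
# `results` is the dict returned by run_gmm_grid_search():
#
#   clustering = GMMExtensiveClustering("embeddings.json")
#   results = clustering.run_gmm_grid_search()
#   _, best_params, _ = results['bic']
#   gmm = GaussianMixture(random_state=42, **best_params)
#   proba = gmm.fit(clustering.embeddings_scaled).predict_proba(clustering.embeddings_scaled)
# ---------------------------------------------------------------------------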