update source code and pipeline

2025-09-04 14:39:02 +00:00
parent 9aabd991c5
commit 878310a551
82 changed files with 24373 additions and 0 deletions

cluster/gmm_extensive.py Normal file

@@ -0,0 +1,649 @@
#!/usr/bin/env python3
"""
Extensive Gaussian Mixture Model clustering with grid search for optimal parameters
Includes BIC and AIC metrics for model selection
"""
import json
import numpy as np
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.decomposition import PCA
import datetime
import csv
import argparse
import warnings
warnings.filterwarnings('ignore')
class GMMExtensiveClustering:
def __init__(self, embeddings_path):
self.embeddings_path = embeddings_path
self.embeddings = None
self.file_paths = None
self.load_embeddings()
def load_embeddings(self):
"""Load embeddings from JSON file"""
print(f"Loading embeddings from {self.embeddings_path}...")
with open(self.embeddings_path, 'r') as f:
data = json.load(f)
self.file_paths = []
embeddings_list = []
for item in data:
self.file_paths.append(item['filepath'])
embeddings_list.append(item['embedding'])
self.embeddings = np.array(embeddings_list, dtype=np.float32)
print(f"Loaded {len(self.file_paths)} samples with embedding dimension {self.embeddings.shape[1]}")
# Standardize embeddings for better clustering
self.scaler = StandardScaler()
self.embeddings_scaled = self.scaler.fit_transform(self.embeddings)
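# (Standardization puts all embedding dimensions on a comparable scale, which helps
# both the EM fit and the distance-based quality metrics computed later.)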
def run_gmm_grid_search(self):
"""Run GMM with optimized grid search for faster execution"""
print("\n" + "="*70)
print("RUNNING GAUSSIAN MIXTURE MODEL CLUSTERING WITH OPTIMIZED GRID SEARCH")
print("="*70)
# Optimized GMM parameter candidates for faster execution
# Smart n_components range with larger steps
max_components = min(50, len(self.embeddings_scaled) // 20) # Reduced max and increased divisor
n_components_candidates = []
# Progressive step sizes: smaller steps for low numbers, larger for high
for n in range(2, max_components + 1):
if n <= 5:
n_components_candidates.append(n) # 2, 3, 4, 5
elif n <= 10:
if n % 2 == 0: # 6, 8, 10
n_components_candidates.append(n)
else:
if n % 3 == 2: # 11, 14, 17, 20
n_components_candidates.append(n)
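# e.g. with 1,000+ samples (max_components = 50) this yields
# [2, 3, 4, 5, 6, 8, 10, 11, 14, 17, 20, 23, 26, 29, 32, 35, 38, 41, 44, 47, 50]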
# Reduced covariance types - only a subset of the four sklearn options is searched
covariance_types = [
# 'full', 'diag',
'tied', 'spherical'
] # 'full' and 'diag' are commented out here; only 'tied' and 'spherical' are tested
# Simplified regularization - focus on key values
reg_covar_candidates = [1e-5, 1e-4, 1e-3] # Removed extreme values
# Reduced n_init - 1 is often sufficient for good initialization methods
n_init_candidates = [1, 5] # Removed 10 to save time
# Focus on best initialization methods
init_params_candidates = ['kmeans', 'k-means++'] # Removed 'random' and 'random_from_data'
# Simplified max_iter - most problems converge quickly
max_iter_candidates = [100, 300] # Removed 500, added 300 as middle ground
print(f"Optimized parameter combinations:")
print(f" - n_components: {len(n_components_candidates)} values {n_components_candidates}")
print(f" - covariance_types: {len(covariance_types)} options {covariance_types}")
print(f" - reg_covar: {len(reg_covar_candidates)} values {reg_covar_candidates}")
print(f" - n_init: {len(n_init_candidates)} values {n_init_candidates}")
print(f" - init_params: {len(init_params_candidates)} options {init_params_candidates}")
print(f" - max_iter: {len(max_iter_candidates)} values {max_iter_candidates}")
total_combinations = (len(n_components_candidates) * len(covariance_types) *
len(reg_covar_candidates) * len(n_init_candidates) *
len(init_params_candidates) * len(max_iter_candidates))
print(f"Total combinations: {total_combinations} (optimized for speed)")
# Estimate time
estimated_time_per_combination = 0.5 # seconds (conservative estimate)
estimated_total_time = total_combinations * estimated_time_per_combination
print(f"Estimated runtime: {estimated_total_time/60:.1f} minutes")
print("This should be much faster...\n")
# Track all results for analysis
all_results = []
# Early stopping criteria for speed optimization
early_stopping_threshold = 0.7 # If we find a very good silhouette score, we can be less exhaustive
good_results_found = 0
max_good_results = 5 # Stop early if we find several very good results
best_bic_score = float('inf')
best_aic_score = float('inf')
best_silhouette_score = -1
best_params_bic = None
best_params_aic = None
best_params_silhouette = None
best_labels_bic = None
best_labels_aic = None
best_labels_silhouette = None
current_combination = 0
# Optimized iteration order: test simpler models first (fewer components, simpler covariance)
for covariance_type in covariance_types: # Start with covariance type
for n_components in n_components_candidates: # Then components
for init_params in init_params_candidates: # Good initialization methods
for reg_covar in reg_covar_candidates: # Regularization
for n_init in n_init_candidates: # Number of initializations
for max_iter in max_iter_candidates: # Iterations last
current_combination += 1
# Progress indicator with time estimation
if current_combination % 50 == 0 or current_combination == total_combinations:
progress = (current_combination / total_combinations) * 100
print(f"Progress: {current_combination}/{total_combinations} ({progress:.1f}%) - "
f"Best scores so far: BIC={best_bic_score:.2f}, Silhouette={best_silhouette_score:.3f}")
try:
# Early convergence check for faster models
tol = 1e-3 if n_components <= 5 else 1e-4 # Less strict tolerance for simple models
# Run GMM
gmm = GaussianMixture(
n_components=n_components,
covariance_type=covariance_type,
reg_covar=reg_covar,
n_init=n_init,
init_params=init_params,
max_iter=max_iter,
tol=tol, # Added tolerance for faster convergence
random_state=42
)
# Fit and predict
gmm.fit(self.embeddings_scaled)
labels = gmm.predict(self.embeddings_scaled)
# Quick validation - skip runs that failed to converge within the short iteration budget
if not gmm.converged_ and max_iter <= 100:
continue # only the low-max_iter runs are skipped; longer runs are kept even if not converged
# Calculate metrics
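# BIC and AIC both penalize model complexity: lower values mean a better trade-off
# between fit and parsimony. gmm.score() returns the average per-sample log-likelihood
# (higher is better).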
bic_score = gmm.bic(self.embeddings_scaled)
aic_score = gmm.aic(self.embeddings_scaled)
log_likelihood = gmm.score(self.embeddings_scaled)
# Only calculate clustering metrics if we have multiple clusters
if len(set(labels)) > 1:
silhouette = silhouette_score(self.embeddings_scaled, labels)
calinski_harabasz = calinski_harabasz_score(self.embeddings_scaled, labels)
davies_bouldin = davies_bouldin_score(self.embeddings_scaled, labels)
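# Metric directions: silhouette and Calinski-Harabasz are better when higher,
# Davies-Bouldin is better when lower.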
# Early stopping check
if silhouette > early_stopping_threshold:
good_results_found += 1
print(f"🎯 Excellent result found: n_comp={n_components}, cov={covariance_type}, "
f"silhouette={silhouette:.4f}")
else:
silhouette = -1
calinski_harabasz = 0
davies_bouldin = float('inf')
# Store result for analysis
result_info = {
'n_components': n_components,
'covariance_type': covariance_type,
'reg_covar': reg_covar,
'n_init': n_init,
'init_params': init_params,
'max_iter': max_iter,
'bic_score': bic_score,
'aic_score': aic_score,
'log_likelihood': log_likelihood,
'silhouette_score': silhouette,
'calinski_harabasz_score': calinski_harabasz,
'davies_bouldin_score': davies_bouldin,
'converged': gmm.converged_,
'n_iter': gmm.n_iter_,
'unique_clusters': len(set(labels))
}
all_results.append(result_info)
# Print promising results
if (silhouette > 0.3 and bic_score < np.percentile([r['bic_score'] for r in all_results], 25)):
print(f"n_components={n_components}, cov={covariance_type}, init={init_params}: "
f"BIC={bic_score:.2f}, AIC={aic_score:.2f}, silhouette={silhouette:.4f}")
# Track best results for different criteria
if bic_score < best_bic_score:
best_bic_score = bic_score
best_params_bic = {
'n_components': n_components,
'covariance_type': covariance_type,
'reg_covar': reg_covar,
'n_init': n_init,
'init_params': init_params,
'max_iter': max_iter
}
best_labels_bic = labels
if aic_score < best_aic_score:
best_aic_score = aic_score
best_params_aic = {
'n_components': n_components,
'covariance_type': covariance_type,
'reg_covar': reg_covar,
'n_init': n_init,
'init_params': init_params,
'max_iter': max_iter
}
best_labels_aic = labels
if silhouette > best_silhouette_score and len(set(labels)) > 1:
best_silhouette_score = silhouette
best_params_silhouette = {
'n_components': n_components,
'covariance_type': covariance_type,
'reg_covar': reg_covar,
'n_init': n_init,
'init_params': init_params,
'max_iter': max_iter
}
best_labels_silhouette = labels
# Early stopping check
if good_results_found >= max_good_results and silhouette > 0.6:
print(f"🛑 Early stopping triggered: Found {good_results_found} excellent results. "
f"Stopping at {current_combination}/{total_combinations} combinations.")
break
except Exception:
# Skip problematic parameter combinations
continue
# Break out of each enclosing loop once early stopping has triggered
if good_results_found >= max_good_results and best_silhouette_score > 0.6:
break
if good_results_found >= max_good_results and best_silhouette_score > 0.6:
break
if good_results_found >= max_good_results and best_silhouette_score > 0.6:
break
if good_results_found >= max_good_results and best_silhouette_score > 0.6:
break
if good_results_found >= max_good_results and best_silhouette_score > 0.6:
break
# Analysis of results
print("\n" + "="*70)
print("GAUSSIAN MIXTURE MODEL GRID SEARCH ANALYSIS")
print("="*70)
if all_results:
import pandas as pd
df_results = pd.DataFrame(all_results)
print(f"Total parameter combinations tested: {len(df_results)}")
# Filter results with valid clustering (more than 1 cluster)
valid_results = df_results[df_results['unique_clusters'] > 1]
print(f"Combinations with valid clustering: {len(valid_results)}")
if len(valid_results) > 0:
# Best scores analysis
print(f"\nModel Selection Metrics:")
print(f"Best BIC score: {df_results['bic_score'].min():.2f}")
print(f"Best AIC score: {df_results['aic_score'].min():.2f}")
print(f"Best Log-Likelihood: {df_results['log_likelihood'].max():.2f}")
print(f"\nClustering Quality Metrics:")
print(f"Best silhouette score: {valid_results['silhouette_score'].max():.4f}")
print(f"Mean silhouette score: {valid_results['silhouette_score'].mean():.4f}")
print(f"Best Calinski-Harabasz score: {valid_results['calinski_harabasz_score'].max():.2f}")
print(f"Best Davies-Bouldin score: {valid_results['davies_bouldin_score'].min():.4f}")
# Top results by different criteria
print(f"\nTop 5 results by BIC (lower is better):")
top_bic = df_results.nsmallest(5, 'bic_score')
for idx, row in top_bic.iterrows():
print(f" n_comp={row['n_components']}, cov={row['covariance_type']}: "
f"BIC={row['bic_score']:.2f}, AIC={row['aic_score']:.2f}")
print(f"\nTop 5 results by AIC (lower is better):")
top_aic = df_results.nsmallest(5, 'aic_score')
for idx, row in top_aic.iterrows():
print(f" n_comp={row['n_components']}, cov={row['covariance_type']}: "
f"BIC={row['bic_score']:.2f}, AIC={row['aic_score']:.2f}")
if len(valid_results) > 0:
print(f"\nTop 5 results by Silhouette Score:")
top_silhouette = valid_results.nlargest(5, 'silhouette_score')
for idx, row in top_silhouette.iterrows():
print(f" n_comp={row['n_components']}, cov={row['covariance_type']}: "
f"silhouette={row['silhouette_score']:.4f}")
# Component count analysis
component_performance = df_results.groupby('n_components').agg({
'bic_score': 'min',
'aic_score': 'min',
'silhouette_score': 'max'
}).reset_index()
print(f"\nComponent count analysis (top 10 by BIC):")
top_components = component_performance.nsmallest(10, 'bic_score')
for idx, row in top_components.iterrows():
print(f" {row['n_components']} components: "
f"BIC={row['bic_score']:.2f}, AIC={row['aic_score']:.2f}, "
f"silhouette={row['silhouette_score']:.4f}")
print(f"\n📁 SAVING DETAILED RESULTS...")
print("="*30)
# Save detailed grid search results
self.save_gmm_grid_search_results(all_results,
best_params_bic, best_bic_score,
best_params_aic, best_aic_score,
best_params_silhouette, best_silhouette_score)
# Return best results based on BIC (primary), AIC (secondary), Silhouette (tertiary)
results = {
'bic': (best_labels_bic, best_params_bic, best_bic_score),
'aic': (best_labels_aic, best_params_aic, best_aic_score),
'silhouette': (best_labels_silhouette, best_params_silhouette, best_silhouette_score)
}
# Print best results
if best_labels_bic is not None:
print(f"\nBest GMM result by BIC:")
print(f"Parameters: {best_params_bic}")
print(f"BIC score: {best_bic_score:.2f}")
if best_labels_aic is not None:
print(f"\nBest GMM result by AIC:")
print(f"Parameters: {best_params_aic}")
print(f"AIC score: {best_aic_score:.2f}")
if best_labels_silhouette is not None:
print(f"\nBest GMM result by Silhouette:")
print(f"Parameters: {best_params_silhouette}")
print(f"Silhouette score: {best_silhouette_score:.4f}")
return results
def save_gmm_grid_search_results(self, all_results,
best_params_bic, best_bic_score,
best_params_aic, best_aic_score,
best_params_silhouette, best_silhouette_score):
"""Save detailed GMM grid search results to JSON file"""
# Prepare comprehensive results data
grid_search_data = {
"experiment_info": {
"timestamp": datetime.datetime.now().isoformat(),
"dataset_path": self.embeddings_path,
"total_samples": len(self.file_paths),
"embedding_dimension": self.embeddings.shape[1],
"total_combinations_tested": len(all_results),
"method": "Gaussian Mixture Model"
},
"best_results": {
"by_bic": {
"parameters": best_params_bic,
"bic_score": best_bic_score if best_bic_score != float('inf') else None
},
"by_aic": {
"parameters": best_params_aic,
"aic_score": best_aic_score if best_aic_score != float('inf') else None
},
"by_silhouette": {
"parameters": best_params_silhouette,
"silhouette_score": best_silhouette_score if best_silhouette_score > -1 else None
}
},
"all_trials": []
}
# Add all trial results
for i, result in enumerate(all_results):
trial_data = {
"trial_id": i + 1,
"parameters": {
"n_components": result['n_components'],
"covariance_type": result['covariance_type'],
"reg_covar": result['reg_covar'],
"n_init": result['n_init'],
"init_params": result['init_params'],
"max_iter": result['max_iter']
},
"results": {
"bic_score": result['bic_score'],
"aic_score": result['aic_score'],
"log_likelihood": result['log_likelihood'],
"silhouette_score": result['silhouette_score'],
"calinski_harabasz_score": result['calinski_harabasz_score'],
"davies_bouldin_score": result['davies_bouldin_score'],
"converged": result['converged'],
"n_iter": result['n_iter'],
"unique_clusters": result['unique_clusters']
}
}
grid_search_data["all_trials"].append(trial_data)
# Calculate summary statistics
if all_results:
bic_scores = [r['bic_score'] for r in all_results]
aic_scores = [r['aic_score'] for r in all_results]
log_likelihoods = [r['log_likelihood'] for r in all_results]
valid_silhouette = [r['silhouette_score'] for r in all_results if r['silhouette_score'] > -1]
grid_search_data["summary_statistics"] = {
"total_trials": len(all_results),
"valid_clustering_trials": len(valid_silhouette),
"bic_score": {
"best": min(bic_scores),
"worst": max(bic_scores),
"mean": sum(bic_scores) / len(bic_scores),
"median": sorted(bic_scores)[len(bic_scores)//2]
},
"aic_score": {
"best": min(aic_scores),
"worst": max(aic_scores),
"mean": sum(aic_scores) / len(aic_scores),
"median": sorted(aic_scores)[len(aic_scores)//2]
},
"log_likelihood": {
"best": max(log_likelihoods),
"worst": min(log_likelihoods),
"mean": sum(log_likelihoods) / len(log_likelihoods)
}
}
if valid_silhouette:
grid_search_data["summary_statistics"]["silhouette_score"] = {
"best": max(valid_silhouette),
"worst": min(valid_silhouette),
"mean": sum(valid_silhouette) / len(valid_silhouette),
"median": sorted(valid_silhouette)[len(valid_silhouette)//2]
}
# Top 10 results by different criteria
sorted_by_bic = sorted(all_results, key=lambda x: x['bic_score'])
sorted_by_aic = sorted(all_results, key=lambda x: x['aic_score'])
valid_results = [r for r in all_results if r['silhouette_score'] > -1]
sorted_by_silhouette = sorted(valid_results, key=lambda x: x['silhouette_score'], reverse=True)
grid_search_data["top_10_results"] = {
"by_bic": [],
"by_aic": [],
"by_silhouette": []
}
for i, result in enumerate(sorted_by_bic[:10]):
grid_search_data["top_10_results"]["by_bic"].append({
"rank": i + 1,
"parameters": {
"n_components": result['n_components'],
"covariance_type": result['covariance_type'],
"init_params": result['init_params']
},
"bic_score": result['bic_score'],
"aic_score": result['aic_score']
})
for i, result in enumerate(sorted_by_aic[:10]):
grid_search_data["top_10_results"]["by_aic"].append({
"rank": i + 1,
"parameters": {
"n_components": result['n_components'],
"covariance_type": result['covariance_type'],
"init_params": result['init_params']
},
"bic_score": result['bic_score'],
"aic_score": result['aic_score']
})
for i, result in enumerate(sorted_by_silhouette[:10]):
grid_search_data["top_10_results"]["by_silhouette"].append({
"rank": i + 1,
"parameters": {
"n_components": result['n_components'],
"covariance_type": result['covariance_type'],
"init_params": result['init_params']
},
"silhouette_score": result['silhouette_score']
})
# Save to file with timestamp
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"gmm_grid_search_detailed_{timestamp}.json"
with open(filename, 'w') as f:
json.dump(grid_search_data, f, indent=4, ensure_ascii=False)
print(f"Detailed grid search results saved to: {filename}")
# Also save a CSV summary for easy analysis
csv_filename = f"gmm_grid_search_summary_{timestamp}.csv"
self.save_grid_search_csv(all_results, csv_filename)
print(f"Grid search summary CSV saved to: {csv_filename}")
def save_grid_search_csv(self, all_results, filename):
"""Save grid search results as CSV for easy analysis"""
with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
fieldnames = ['trial_id', 'n_components', 'covariance_type', 'reg_covar',
'n_init', 'init_params', 'max_iter', 'bic_score', 'aic_score',
'log_likelihood', 'silhouette_score', 'calinski_harabasz_score',
'davies_bouldin_score', 'converged', 'n_iter', 'unique_clusters']
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()
for i, result in enumerate(all_results):
writer.writerow({
'trial_id': i + 1,
'n_components': result['n_components'],
'covariance_type': result['covariance_type'],
'reg_covar': result['reg_covar'],
'n_init': result['n_init'],
'init_params': result['init_params'],
'max_iter': result['max_iter'],
'bic_score': result['bic_score'],
'aic_score': result['aic_score'],
'log_likelihood': result['log_likelihood'],
'silhouette_score': result['silhouette_score'],
'calinski_harabasz_score': result['calinski_harabasz_score'],
'davies_bouldin_score': result['davies_bouldin_score'],
'converged': result['converged'],
'n_iter': result['n_iter'],
'unique_clusters': result['unique_clusters']
})
def visualize_results(self, results):
"""Visualize clustering results using PCA"""
fig, axes = plt.subplots(1, 3, figsize=(18, 6))
# Reduce dimensions for visualization
pca = PCA(n_components=2, random_state=42)
embeddings_2d = pca.fit_transform(self.embeddings_scaled)
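# PCA is used here only to project to 2-D for plotting; the cluster labels
# were fit on the full standardized embedding space.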
methods = ['bic', 'aic', 'silhouette']
titles = ['Best by BIC', 'Best by AIC', 'Best by Silhouette']
for idx, (method, title) in enumerate(zip(methods, titles)):
labels, params, score = results[method]
if labels is not None:
unique_labels = set(labels)
colors = plt.cm.Set3(np.linspace(0, 1, len(unique_labels)))
for label, color in zip(unique_labels, colors):
mask = labels == label
axes[idx].scatter(embeddings_2d[mask, 0], embeddings_2d[mask, 1],
c=[color], s=50, alpha=0.7, label=f'Cluster {label}')
axes[idx].set_title(f'{title}\nn_components={params["n_components"]}, '
f'cov={params["covariance_type"]}')
else:
axes[idx].text(0.5, 0.5, 'No valid clustering', ha='center', va='center',
transform=axes[idx].transAxes, fontsize=12)
axes[idx].set_title(f'{title}\n(Failed)')
axes[idx].set_xlabel('PCA Component 1')
axes[idx].set_ylabel('PCA Component 2')
axes[idx].grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('gmm_clustering_results.png', dpi=300, bbox_inches='tight')
plt.show()
print(f"Visualization saved as 'gmm_clustering_results.png'")
def save_clustering_results(self, results):
"""Save final clustering results to JSON files"""
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
for method in ['bic', 'aic', 'silhouette']:
labels, params, score = results[method]
if labels is not None:
clustering_results = []
for filepath, label in zip(self.file_paths, labels):
clustering_results.append({
"filepath": filepath,
"cluster": int(label)
})
filename = f"gmm_final_results_{method}_{timestamp}.json"
with open(filename, 'w') as f:
json.dump({
"method": f"GMM (best by {method.upper()})",
"parameters": params,
"n_components": params['n_components'],
"n_samples": len(labels),
f"{method}_score": score,
"results": clustering_results
}, f, indent=4)
print(f"Final clustering results ({method}) saved to: {filename}")
def main():
parser = argparse.ArgumentParser(description="Run extensive Gaussian Mixture Model clustering on document embeddings")
parser.add_argument("--embeddings_path", required=True, help="Path to embeddings JSON file")
args = parser.parse_args()
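# Example invocation (path is illustrative):
#   python cluster/gmm_extensive.py --embeddings_path embeddings.json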
# Initialize clustering
clustering = GMMExtensiveClustering(args.embeddings_path)
# Run extensive grid search
results = clustering.run_gmm_grid_search()
if any(labels is not None for labels, _, _ in results.values()):
# Visualize and save results
clustering.visualize_results(results)
clustering.save_clustering_results(results)
print("\nGMM extensive clustering completed successfully!")
else:
print("\nGMM extensive clustering did not find suitable clusters.")
if __name__ == "__main__":
main()