update source code and pipeline
This commit is contained in:
670
cluster/auto_cluster copy.py
Normal file
670
cluster/auto_cluster copy.py
Normal file
@@ -0,0 +1,670 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Simple script to run automatic clustering methods (DBSCAN, Mean Shift, Affinity Propagation)
|
||||
These methods don't require specifying the number of clusters beforehand.
|
||||
"""
|
||||
|
||||
import json
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
from sklearn.cluster import DBSCAN, MeanShift, AffinityPropagation
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from sklearn.metrics import silhouette_score
|
||||
from sklearn.neighbors import NearestNeighbors
|
||||
from sklearn.decomposition import PCA
|
||||
import argparse
|
||||
import warnings
|
||||
warnings.filterwarnings('ignore')
|
||||
|
||||
class AutoClustering:
|
||||
def __init__(self, embeddings_path):
|
||||
self.embeddings_path = embeddings_path
|
||||
self.embeddings = None
|
||||
self.file_paths = None
|
||||
self.load_embeddings()
|
||||
|
||||
def load_embeddings(self):
|
||||
"""Load embeddings from JSON file"""
|
||||
print(f"Loading embeddings from {self.embeddings_path}...")
|
||||
with open(self.embeddings_path, 'r') as f:
|
||||
data = json.load(f)
|
||||
|
||||
self.file_paths = []
|
||||
embeddings_list = []
|
||||
|
||||
for item in data:
|
||||
self.file_paths.append(item['filepath'])
|
||||
embeddings_list.append(item['embedding'])
|
||||
|
||||
self.embeddings = np.array(embeddings_list, dtype=np.float32)
|
||||
print(f"Loaded {len(self.file_paths)} samples with embedding dimension {self.embeddings.shape[1]}")
|
||||
|
||||
# Standardize embeddings for better clustering
|
||||
self.scaler = StandardScaler()
|
||||
self.embeddings_scaled = self.scaler.fit_transform(self.embeddings)
|
||||
|
||||
def run_dbscan(self):
    """Grid-search DBSCAN parameters and keep the best silhouette score.

    Candidate ``eps`` values come from three sources: percentiles of
    k-nearest-neighbour distances, statistics of pairwise distances on a
    random sample of at most 1000 points, and a fixed manual list.  Every
    (eps, min_samples) pair is fitted on ``self.embeddings_scaled``.  Runs
    with at least two clusters and under 90% noise are scored with the
    silhouette coefficient computed on the non-noise points.  All trial
    records are handed to ``save_dbscan_grid_search_results``.

    Returns:
        The label array of the best-scoring run, or ``None`` when no
        combination produced a scorable clustering.
    """
    from scipy.spatial.distance import pdist

    print("\n" + "="*50)
    print("RUNNING DBSCAN CLUSTERING WITH EXTENSIVE GRID SEARCH")
    print("="*50)

    data = self.embeddings_scaled
    n_samples = len(data)

    # Method 1: percentiles of the k-th nearest neighbour distance.
    # Percentiles do not depend on ordering, so no sort is needed.
    eps_candidates = []
    percentiles = [60, 65, 70, 75, 80, 85, 90, 95]
    for k in [5, 10, 15, 20, 25, 30]:
        k_actual = min(k, n_samples // 4)
        if k_actual < 3:
            continue
        distances, _ = NearestNeighbors(n_neighbors=k_actual).fit(data).kneighbors(data)
        eps_candidates.extend(np.percentile(distances[:, k_actual - 1], percentiles))

    # Method 2: statistics of pairwise distances (sampled for efficiency).
    sample_size = min(1000, n_samples)
    sample_indices = np.random.choice(n_samples, sample_size, replace=False)
    pairwise_distances = pdist(data[sample_indices])
    mean_dist = np.mean(pairwise_distances)
    median_dist = np.median(pairwise_distances)
    std_dist = np.std(pairwise_distances)
    eps_candidates.extend(mean_dist * factor for factor in (0.3, 0.4, 0.5, 0.6, 0.7))
    eps_candidates.extend(median_dist * factor for factor in (0.3, 0.4, 0.5, 0.6))
    eps_candidates.extend(std_dist * factor for factor in (0.5, 0.8, 1.0, 1.2))

    # Method 3: manual eps values for different scales.
    eps_candidates.extend([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0,
                           1.2, 1.5, 1.8, 2.0, 2.5, 3.0, 3.5, 4.0, 5.0])

    # Drop duplicates and non-positive / NaN values, then sort.  Built-in
    # floats keep the trial records JSON-serializable.
    eps_candidates = sorted({float(eps) for eps in eps_candidates if eps > 0})

    # min_samples is currently pinned to one value; add entries here to
    # search over it as well.
    min_samples_candidates = [60]

    best_score = -1
    best_params = None
    best_labels = None

    total_combinations = len(eps_candidates) * len(min_samples_candidates)
    print(f"Testing {len(eps_candidates)} eps values x {len(min_samples_candidates)} min_samples values")
    print(f"Total combinations: {total_combinations}")
    print("This may take a while...\n")

    # Track all results for analysis
    all_results = []
    current_combination = 0

    for eps in eps_candidates:
        for min_samples in min_samples_candidates:
            current_combination += 1

            # Progress indicator
            if current_combination % 50 == 0 or current_combination == total_combinations:
                progress = (current_combination / total_combinations) * 100
                print(f"Progress: {current_combination}/{total_combinations} ({progress:.1f}%)")

            try:
                labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(data)
            except Exception as exc:
                # Skip problematic parameter combinations, but say so.
                print(f"eps={eps:.4f}, min_samples={min_samples}: DBSCAN failed ({exc})")
                continue

            non_noise = labels != -1
            n_noise = int(np.sum(~non_noise))
            n_clusters = len(np.unique(labels[non_noise]))
            noise_ratio = n_noise / len(labels)

            # The score key is always present so downstream consumers can
            # index it without checking.
            result_info = {
                'eps': eps,
                'min_samples': min_samples,
                'n_clusters': n_clusters,
                'n_noise': n_noise,
                'noise_ratio': noise_ratio,
                'silhouette_score': None,
            }

            # Only score meaningful clusterings (not too many noise points).
            if n_clusters >= 2 and noise_ratio < 0.9:
                try:
                    score = float(silhouette_score(data[non_noise], labels[non_noise]))
                except Exception:
                    score = None

                if score is not None:
                    result_info['silhouette_score'] = score

                    # Print promising results
                    if score > 0.1:
                        print(f"eps={eps:.4f}, min_samples={min_samples}: {n_clusters} clusters, {n_noise} noise ({noise_ratio:.1%}), silhouette={score:.4f}")

                    if score > best_score:
                        best_score = score
                        best_params = (eps, min_samples)
                        best_labels = labels

            all_results.append(result_info)

    # Analysis of results
    print("\n" + "="*50)
    print("DBSCAN GRID SEARCH ANALYSIS")
    print("="*50)

    if all_results:
        import pandas as pd
        df_results = pd.DataFrame(all_results)

        print(f"Total parameter combinations tested: {len(df_results)}")

        # Valid results (with clusters)
        valid_results = df_results[df_results['n_clusters'] >= 2]
        print(f"Combinations that produced clusters: {len(valid_results)}")

        if len(valid_results) > 0:
            scored_results = valid_results.dropna(subset=['silhouette_score'])
            if len(scored_results) > 0:
                print(f"Combinations with valid silhouette scores: {len(scored_results)}")
                print(f"Best silhouette score: {scored_results['silhouette_score'].max():.4f}")
                print(f"Mean silhouette score: {scored_results['silhouette_score'].mean():.4f}")

                # Top 5 results
                top_results = scored_results.nlargest(5, 'silhouette_score')
                print("\nTop 5 parameter combinations:")
                for _, row in top_results.iterrows():
                    print(f"  eps={row['eps']:.4f}, min_samples={row['min_samples']}: "
                          f"{row['n_clusters']} clusters, silhouette={row['silhouette_score']:.4f}")

            # Cluster count distribution
            cluster_counts = valid_results['n_clusters'].value_counts().sort_index()
            print("\nCluster count distribution:")
            for n_clusters, count in cluster_counts.items():
                print(f"  {n_clusters} clusters: {count} parameter combinations")

        print("\n📁 SAVING DETAILED RESULTS...")
        print("="*30)

        # Save detailed grid search results to JSON file
        self.save_dbscan_grid_search_results(all_results, best_params, best_score)

    if best_labels is None:
        print("DBSCAN could not find suitable clusters with the extensive grid search")
        print("Consider:")
        print("- Adjusting the embedding space (different model or preprocessing)")
        print("- Using different clustering algorithms")
        print("- Manual parameter tuning based on domain knowledge")
        return None

    n_clusters = len(set(best_labels)) - (1 if -1 in best_labels else 0)
    n_noise = list(best_labels).count(-1)

    print("\nBest DBSCAN result:")
    print(f"Parameters: eps={best_params[0]:.4f}, min_samples={best_params[1]}")
    print(f"Number of clusters: {n_clusters}")
    print(f"Number of noise points: {n_noise} ({n_noise/len(best_labels)*100:.1f}%)")
    print(f"Silhouette score: {best_score:.4f}")

    return best_labels
|
||||
|
||||
def save_dbscan_grid_search_results(self, all_results, best_params, best_score):
|
||||
"""Save detailed DBSCAN grid search results to JSON file"""
|
||||
import datetime
|
||||
|
||||
# Prepare comprehensive results data
|
||||
grid_search_data = {
|
||||
"experiment_info": {
|
||||
"timestamp": datetime.datetime.now().isoformat(),
|
||||
"dataset_path": self.embeddings_path,
|
||||
"total_samples": len(self.file_paths),
|
||||
"embedding_dimension": self.embeddings.shape[1],
|
||||
"total_combinations_tested": len(all_results)
|
||||
},
|
||||
"best_result": {
|
||||
"eps": best_params[0] if best_params else None,
|
||||
"min_samples": best_params[1] if best_params else None,
|
||||
"silhouette_score": best_score if best_score > -1 else None
|
||||
},
|
||||
"all_trials": []
|
||||
}
|
||||
|
||||
# Add all trial results
|
||||
for i, result in enumerate(all_results):
|
||||
trial_data = {
|
||||
"trial_id": i + 1,
|
||||
"parameters": {
|
||||
"eps": result['eps'],
|
||||
"min_samples": result['min_samples']
|
||||
},
|
||||
"results": {
|
||||
"n_clusters": result['n_clusters'],
|
||||
"n_noise": result['n_noise'],
|
||||
"noise_ratio": result['noise_ratio'],
|
||||
"silhouette_score": result['silhouette_score']
|
||||
},
|
||||
"status": "success" if result['silhouette_score'] is not None else "failed"
|
||||
}
|
||||
grid_search_data["all_trials"].append(trial_data)
|
||||
|
||||
# Calculate summary statistics
|
||||
valid_trials = [t for t in grid_search_data["all_trials"] if t["status"] == "success"]
|
||||
if valid_trials:
|
||||
silhouette_scores = [t["results"]["silhouette_score"] for t in valid_trials if t["results"]["silhouette_score"] is not None]
|
||||
if silhouette_scores:
|
||||
grid_search_data["summary_statistics"] = {
|
||||
"total_trials": len(all_results),
|
||||
"successful_trials": len(valid_trials),
|
||||
"success_rate": len(valid_trials) / len(all_results),
|
||||
"best_silhouette_score": max(silhouette_scores),
|
||||
"worst_silhouette_score": min(silhouette_scores),
|
||||
"mean_silhouette_score": sum(silhouette_scores) / len(silhouette_scores),
|
||||
"median_silhouette_score": sorted(silhouette_scores)[len(silhouette_scores)//2]
|
||||
}
|
||||
|
||||
# Top 10 results
|
||||
sorted_valid_trials = sorted(valid_trials,
|
||||
key=lambda x: x["results"]["silhouette_score"] if x["results"]["silhouette_score"] is not None else -1,
|
||||
reverse=True)
|
||||
grid_search_data["top_10_results"] = sorted_valid_trials[:10]
|
||||
|
||||
# Parameter analysis
|
||||
eps_values = [t["parameters"]["eps"] for t in valid_trials]
|
||||
min_samples_values = [t["parameters"]["min_samples"] for t in valid_trials]
|
||||
|
||||
grid_search_data["parameter_analysis"] = {
|
||||
"eps_range": {
|
||||
"min": min(eps_values),
|
||||
"max": max(eps_values),
|
||||
"mean": sum(eps_values) / len(eps_values)
|
||||
},
|
||||
"min_samples_range": {
|
||||
"min": min(min_samples_values),
|
||||
"max": max(min_samples_values),
|
||||
"mean": sum(min_samples_values) / len(min_samples_values)
|
||||
}
|
||||
}
|
||||
|
||||
# Save to file with timestamp
|
||||
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
filename = f"dbscan_grid_search_detailed_{timestamp}.json"
|
||||
|
||||
with open(filename, 'w') as f:
|
||||
json.dump(grid_search_data, f, indent=4, ensure_ascii=False)
|
||||
|
||||
print(f"Detailed grid search results saved to: {filename}")
|
||||
|
||||
# Also save a CSV summary for easy analysis
|
||||
csv_filename = f"dbscan_grid_search_summary_{timestamp}.csv"
|
||||
self.save_grid_search_csv(all_results, csv_filename)
|
||||
print(f"Grid search summary CSV saved to: {csv_filename}")
|
||||
|
||||
def save_grid_search_csv(self, all_results, filename):
    """Write one CSV row per grid-search trial.

    Args:
        all_results: List of trial dicts with ``eps``, ``min_samples``,
            ``n_clusters``, ``n_noise`` and ``noise_ratio``; the
            ``silhouette_score`` entry may be ``None`` or absent, in which
            case the row is marked ``failed`` and the score cell is empty.
        filename: Path of the CSV file to create (overwritten if present).
    """
    import csv

    fieldnames = ['trial_id', 'eps', 'min_samples', 'n_clusters', 'n_noise',
                  'noise_ratio', 'silhouette_score', 'status']

    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for trial_id, result in enumerate(all_results, start=1):
            # Tolerate trials that never got a score recorded.
            score = result.get('silhouette_score')
            writer.writerow({
                'trial_id': trial_id,
                'eps': result['eps'],
                'min_samples': result['min_samples'],
                'n_clusters': result['n_clusters'],
                'n_noise': result['n_noise'],
                'noise_ratio': result['noise_ratio'],
                'silhouette_score': score,
                'status': 'success' if score is not None else 'failed',
            })
|
||||
|
||||
def run_mean_shift(self):
    """Run Mean Shift over several bandwidths and keep the best silhouette.

    Bandwidth candidates are 0.5x / 1x / 1.5x of (a) sklearn's
    ``estimate_bandwidth`` and (b) the mean distance to the nearest
    neighbours; a fixed fallback list is used if neither yields a positive
    value.  A run is scored only when it produces between 2 and
    ``n_samples // 3`` clusters.

    Returns:
        The label array of the best-scoring run, or ``None`` if no
        bandwidth produced an acceptable clustering.
    """
    from sklearn.cluster import estimate_bandwidth

    print("\n" + "="*50)
    print("RUNNING MEAN SHIFT CLUSTERING")
    print("="*50)

    data = self.embeddings_scaled
    n_samples = len(data)
    bandwidth_candidates = []

    # Method 1: sklearn's estimate_bandwidth.  Best effort: on failure fall
    # through to the other estimate, but report it instead of hiding it.
    try:
        bw_est = estimate_bandwidth(data, quantile=0.3, n_samples=min(500, n_samples))
        if bw_est > 0:
            bandwidth_candidates.extend([bw_est * 0.5, bw_est, bw_est * 1.5])
    except Exception as exc:
        print(f"Bandwidth estimation failed ({str(exc)[:50]}...)")

    # Method 2: nearest neighbor distances.  n_neighbors may not exceed the
    # number of fitted samples, so cap it for small datasets.
    neighbors = NearestNeighbors(n_neighbors=min(10, n_samples)).fit(data)
    distances, _ = neighbors.kneighbors(data)
    mean_dist = np.mean(distances[:, 1:])  # Exclude self-distance
    bandwidth_candidates.extend([mean_dist * 0.5, mean_dist, mean_dist * 1.5])

    # Remove duplicates and invalid values (NaN compares false and is dropped).
    bandwidth_candidates = list({bw for bw in bandwidth_candidates if bw > 0})
    if not bandwidth_candidates:
        bandwidth_candidates = [0.5, 1.0, 1.5, 2.0]

    best_score = -1
    best_bandwidth = None
    best_labels = None

    print("Testing different bandwidth values...")
    for bandwidth in bandwidth_candidates:
        try:
            labels = MeanShift(bandwidth=bandwidth).fit_predict(data)

            n_clusters = len(set(labels))
            if 2 <= n_clusters <= n_samples // 3:
                score = silhouette_score(data, labels)
                print(f"bandwidth={bandwidth:.4f}: {n_clusters} clusters, silhouette={score:.4f}")

                if score > best_score:
                    best_score = score
                    best_bandwidth = bandwidth
                    best_labels = labels
        except Exception as e:
            print(f"bandwidth={bandwidth:.4f}: failed ({str(e)[:50]}...)")
            continue

    if best_labels is None:
        print("Mean Shift could not find suitable clusters")
        return None

    print("\nBest Mean Shift result:")
    print(f"Bandwidth: {best_bandwidth:.4f}")
    print(f"Number of clusters: {len(set(best_labels))}")
    print(f"Silhouette score: {best_score:.4f}")

    return best_labels
|
||||
|
||||
def run_affinity_propagation(self):
    """Run Affinity Propagation over a preference/damping grid.

    Similarities are negative squared Euclidean distances between the
    scaled embeddings.  Preference candidates are the 10th, 25th, 50th and
    75th percentiles of the pairwise (off-diagonal) similarities.  The
    diagonal itself is identically zero, so it carries no information for
    choosing a preference.  A run is scored only when it yields between 2
    and ``n_samples // 3`` clusters.

    Returns:
        The label array of the best-scoring run, or ``None`` if no
        parameter combination produced an acceptable clustering.
    """
    from scipy.spatial.distance import pdist

    print("\n" + "="*50)
    print("RUNNING AFFINITY PROPAGATION CLUSTERING")
    print("="*50)

    data = self.embeddings_scaled

    # Condensed pairwise similarities: avoids materializing an n x n x d
    # difference tensor and already excludes the all-zero diagonal.
    pairwise_similarities = -pdist(data, metric='sqeuclidean')
    if pairwise_similarities.size == 0:
        # Fewer than two samples: there is nothing to cluster.
        print("Affinity Propagation could not find suitable clusters")
        return None

    preference_candidates = sorted({
        float(np.percentile(pairwise_similarities, p)) for p in (10, 25, 50, 75)
    })
    damping_candidates = [0.5, 0.7, 0.8, 0.9]

    best_score = -1
    best_params = None
    best_labels = None

    print("Testing different parameter combinations...")
    for preference in preference_candidates:
        for damping in damping_candidates:
            try:
                affinity_prop = AffinityPropagation(
                    preference=preference,
                    damping=damping,
                    random_state=42,
                    max_iter=200
                )
                labels = affinity_prop.fit_predict(data)

                n_clusters = len(set(labels))
                if 2 <= n_clusters <= len(data) // 3:
                    score = silhouette_score(data, labels)
                    print(f"preference={preference:.2f}, damping={damping:.1f}: {n_clusters} clusters, silhouette={score:.4f}")

                    if score > best_score:
                        best_score = score
                        best_params = (preference, damping)
                        best_labels = labels
            except Exception as e:
                print(f"preference={preference:.2f}, damping={damping:.1f}: failed ({str(e)[:30]}...)")
                continue

    if best_labels is None:
        print("Affinity Propagation could not find suitable clusters")
        return None

    print("\nBest Affinity Propagation result:")
    print(f"Parameters: preference={best_params[0]:.2f}, damping={best_params[1]:.1f}")
    print(f"Number of clusters: {len(set(best_labels))}")
    print(f"Silhouette score: {best_score:.4f}")

    return best_labels
|
||||
|
||||
def visualize_results(self, results_dict):
    """Plot every clustering result on a 2-D PCA projection, side by side.

    Args:
        results_dict: Mapping of method name to a label array aligned with
            ``self.embeddings_scaled``.  Label ``-1`` is drawn as noise.

    The figure is written to ``auto_clustering_results.png`` and shown.
    """
    if not results_dict:
        print("No results to visualize")
        return

    # Project to two components purely for display purposes.
    projection = PCA(n_components=2, random_state=42).fit_transform(self.embeddings_scaled)

    panel_count = len(results_dict)
    fig, axes = plt.subplots(1, panel_count, figsize=(5*panel_count, 4))
    # A single subplot comes back as a bare Axes; wrap it so iteration works.
    if panel_count == 1:
        axes = [axes]

    for ax, (method_name, labels) in zip(axes, results_dict.items()):
        unique_labels = set(labels)
        palette = plt.cm.Set1(np.linspace(0, 1, len(unique_labels)))

        for label, color in zip(unique_labels, palette):
            members = labels == label
            xs = projection[members, 0]
            ys = projection[members, 1]
            if label == -1:
                # Noise points in black
                ax.scatter(xs, ys, c='black', marker='x', s=20, alpha=0.5, label='Noise')
            else:
                ax.scatter(xs, ys, c=[color], s=50, alpha=0.7, label=f'Cluster {label}')

        cluster_total = len(set(labels)) - (1 if -1 in labels else 0)
        ax.set_title(f'{method_name}\n({cluster_total} clusters)')
        ax.set_xlabel('PCA Component 1')
        ax.set_ylabel('PCA Component 2')
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig('auto_clustering_results.png', dpi=300, bbox_inches='tight')
    plt.show()

    print("\nVisualization saved as 'auto_clustering_results.png'")
|
||||
|
||||
def save_results(self, results_dict):
|
||||
"""Save clustering results to JSON files"""
|
||||
print(results_dict.items())
|
||||
check_method_name = []
|
||||
print(len(results_dict))
|
||||
for method_name, labels in results_dict.items():
|
||||
check_method_name.append(method_name)
|
||||
# Create results for each method
|
||||
method_results = []
|
||||
print(method_name == 'DBSCAN')
|
||||
|
||||
|
||||
|
||||
for filepath, label in zip(self.file_paths, labels):
|
||||
if method_name == 'DBSCAN':
|
||||
if label == -1:
|
||||
is_noise = True
|
||||
else:
|
||||
is_noise = False
|
||||
else:
|
||||
is_noise = False
|
||||
|
||||
method_results.append({
|
||||
"filepath": filepath,
|
||||
"cluster": int(label),
|
||||
"is_noise": is_noise
|
||||
})
|
||||
print('method_name', set(check_method_name))
|
||||
print(method_results[0]['is_noise'])
|
||||
print(method_results[0])
|
||||
|
||||
# Save to file
|
||||
filename = f"{method_name.lower().replace(' ', '_')}_results.json"
|
||||
with open(filename, 'w') as f:
|
||||
json.dump({
|
||||
"method": method_name,
|
||||
"n_clusters": len(set(labels)) - (1 if -1 in labels else 0),
|
||||
"n_samples": len(labels),
|
||||
"results": method_results
|
||||
}, f, indent=4)
|
||||
|
||||
print(f"Results saved to {filename}")
|
||||
|
||||
def run_all_methods(self):
|
||||
"""Run all automatic clustering methods"""
|
||||
print("\n" + "="*70)
|
||||
print("AUTOMATIC CLUSTERING ANALYSIS")
|
||||
print("="*70)
|
||||
print(f"Dataset: {len(self.file_paths)} documents")
|
||||
print(f"Embedding dimension: {self.embeddings.shape[1]}")
|
||||
|
||||
results = {}
|
||||
|
||||
# Run DBSCAN
|
||||
dbscan_labels = self.run_dbscan()
|
||||
if dbscan_labels is not None:
|
||||
results["DBSCAN"] = dbscan_labels
|
||||
|
||||
# Run Mean Shift
|
||||
# meanshift_labels = self.run_mean_shift()
|
||||
# if meanshift_labels is not None:
|
||||
# results["Mean Shift"] = meanshift_labels
|
||||
|
||||
# Run Affinity Propagation
|
||||
# affinity_labels = self.run_affinity_propagation()
|
||||
# if affinity_labels is not None:
|
||||
# results["Affinity Propagation"] = affinity_labels
|
||||
|
||||
# Summary
|
||||
if results:
|
||||
print("\n" + "="*70)
|
||||
print("SUMMARY OF RESULTS")
|
||||
print("="*70)
|
||||
|
||||
for method, labels in results.items():
|
||||
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
|
||||
if method == "DBSCAN":
|
||||
n_noise = list(labels).count(-1)
|
||||
print(f"{method}: {n_clusters} clusters, {n_noise} noise points")
|
||||
else:
|
||||
print(f"{method}: {n_clusters} clusters")
|
||||
|
||||
# Calculate agreement between methods if multiple succeeded
|
||||
if len(results) > 1:
|
||||
from sklearn.metrics import adjusted_rand_score
|
||||
print("\nMethod Agreement (Adjusted Rand Index):")
|
||||
method_names = list(results.keys())
|
||||
for i in range(len(method_names)):
|
||||
for j in range(i+1, len(method_names)):
|
||||
ari = adjusted_rand_score(results[method_names[i]], results[method_names[j]])
|
||||
print(f"{method_names[i]} vs {method_names[j]}: {ari:.4f}")
|
||||
|
||||
# Visualize and save results
|
||||
self.visualize_results(results)
|
||||
self.save_results(results)
|
||||
|
||||
else:
|
||||
print("\nNo automatic clustering method found suitable clusters.")
|
||||
print("This might indicate:")
|
||||
print("- Data doesn't have clear cluster structure")
|
||||
print("- Embeddings need different preprocessing")
|
||||
print("- Different parameter ranges needed")
|
||||
|
||||
return results
|
||||
|
||||
def main():
    """Parse the command line and run the requested clustering method(s).

    ``--embeddings_path`` is mandatory; ``--method`` selects ``dbscan``,
    ``meanshift``, ``affinity`` or ``all`` (the default).  For a single
    method the result, if any, is visualized and saved.
    """
    parser = argparse.ArgumentParser(description="Run automatic clustering methods on document embeddings")
    # Required: without a path the loader would fail later on open(None).
    parser.add_argument("--embeddings_path", required=True, help="Path to embeddings JSON file")
    parser.add_argument("--method", choices=['dbscan', 'meanshift', 'affinity', 'all'], default='all',
                        help="Which automatic method to run")

    args = parser.parse_args()

    # Initialize clustering
    clustering = AutoClustering(args.embeddings_path)

    if args.method == 'all':
        clustering.run_all_methods()
        return

    # CLI choice -> (name used in plots and result files, bound runner).
    runners = {
        'dbscan': ("DBSCAN", clustering.run_dbscan),
        'meanshift': ("Mean Shift", clustering.run_mean_shift),
        'affinity': ("Affinity Propagation", clustering.run_affinity_propagation),
    }
    display_name, runner = runners[args.method]
    labels = runner()
    if labels is not None:
        clustering.visualize_results({display_name: labels})
        clustering.save_results({display_name: labels})


if __name__ == "__main__":
    main()
|
711
cluster/auto_cluster.py
Normal file
711
cluster/auto_cluster.py
Normal file
@@ -0,0 +1,711 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Simple script to run automatic clustering methods (DBSCAN, Mean Shift, Affinity Propagation)
|
||||
These methods don't require specifying the number of clusters beforehand.
|
||||
"""
|
||||
|
||||
import json
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
from sklearn.cluster import DBSCAN, MeanShift, AffinityPropagation
|
||||
from sklearn.preprocessing import normalize
|
||||
from sklearn.metrics import silhouette_score
|
||||
from sklearn.neighbors import NearestNeighbors
|
||||
from sklearn.decomposition import PCA
|
||||
import argparse
|
||||
import warnings
|
||||
warnings.filterwarnings('ignore')
|
||||
from sklearn.metrics.pairwise import cosine_similarity
|
||||
from datetime import datetime
|
||||
|
||||
import numpy as np
|
||||
|
||||
def value_counts(a, dropna=False):
    """Count how often each distinct value occurs in an array-like.

    Args:
        a: Array or array-like of any shape; it is flattened before counting.
        dropna: When true and the data is floating point, NaN entries are
            left out of the counts.

    Returns:
        dict mapping each distinct value to its number of occurrences, in
        ascending order of value.  Keys and counts are built-in Python
        scalars, so the result prints cleanly and is JSON-friendly.
    """
    # asarray lets plain lists/tuples through as well as ndarrays.
    flat = np.asarray(a).ravel()
    if dropna and np.issubdtype(flat.dtype, np.floating):
        flat = flat[~np.isnan(flat)]
    uniq, counts = np.unique(flat, return_counts=True)
    return dict(zip(uniq.tolist(), counts.tolist()))
|
||||
|
||||
class AutoClustering:
|
||||
def __init__(self, embeddings_path):
|
||||
self.embeddings_path = embeddings_path
|
||||
self.embeddings = None
|
||||
self.file_paths = None
|
||||
self.load_embeddings()
|
||||
|
||||
def load_embeddings(self):
|
||||
"""Load embeddings from JSON file"""
|
||||
print(f"Loading embeddings from {self.embeddings_path}...")
|
||||
with open(self.embeddings_path, 'r') as f:
|
||||
data = json.load(f)
|
||||
|
||||
self.file_paths = []
|
||||
embeddings_list = []
|
||||
|
||||
for item in data:
|
||||
self.file_paths.append(item['filepath'])
|
||||
embeddings_list.append(item['embedding'])
|
||||
|
||||
self.embeddings = np.array(embeddings_list, dtype=np.float32)
|
||||
print(f"Loaded {len(self.file_paths)} samples with embedding dimension {self.embeddings.shape[1]}")
|
||||
|
||||
# Normalize embeddings using L2 normalization for cosine distance
|
||||
self.embeddings_normalized = normalize(self.embeddings, norm='l2', axis=1)
|
||||
print("Applied L2 normalization to embeddings")
|
||||
|
||||
sims = cosine_similarity(self.embeddings)
|
||||
print(self.embeddings.shape)
|
||||
# lấy upper triangle exclude diagonal để inspect
|
||||
triu_idxs = np.triu_indices_from(sims, k=1)
|
||||
dist_vals = sims[triu_idxs]
|
||||
print(dist_vals.shape)
|
||||
print("mean sim:", dist_vals.mean(), "std:", dist_vals.std())
|
||||
|
||||
|
||||
def run_dbscan(self):
|
||||
"""Run DBSCAN with extensive grid search for parameter estimation"""
|
||||
print("\n" + "="*50)
|
||||
print("RUNNING DBSCAN CLUSTERING WITH EXTENSIVE GRID SEARCH")
|
||||
print("="*50)
|
||||
|
||||
# Method 1: K-nearest neighbors approach with multiple k values
|
||||
# eps_candidates = []
|
||||
|
||||
# # Try different k values for nearest neighbors with cosine metric
|
||||
# k_values = [5, 10, 15, 20, 25, 30]
|
||||
# for k in k_values:
|
||||
# k_actual = min(k, len(self.embeddings_normalized) // 4)
|
||||
# if k_actual < 3:
|
||||
# continue
|
||||
|
||||
# neighbors = NearestNeighbors(n_neighbors=k_actual, metric='cosine')
|
||||
# neighbors_fit = neighbors.fit(self.embeddings_normalized)
|
||||
# distances, indices = neighbors_fit.kneighbors(self.embeddings_normalized)
|
||||
|
||||
# # Sort distances and use k-th nearest neighbor distance
|
||||
# distances = np.sort(distances, axis=0)
|
||||
# kth_distances = distances[:, k_actual-1]
|
||||
|
||||
# # Multiple percentile thresholds for each k
|
||||
# percentiles = [60, 65, 70, 75, 80, 85, 90, 95]
|
||||
# for p in percentiles:
|
||||
# eps_candidates.append(np.percentile(kth_distances, p))
|
||||
|
||||
# # Method 2: Statistical measures using cosine distances
|
||||
# # Calculate cosine distances for a sample of data points
|
||||
# sample_size = min(1000, len(self.embeddings_normalized))
|
||||
# sample_indices = np.random.choice(len(self.embeddings_normalized), sample_size, replace=False)
|
||||
# sample_data = self.embeddings_normalized[sample_indices]
|
||||
|
||||
# from scipy.spatial.distance import pdist
|
||||
# cosine_distances = pdist(sample_data, metric='cosine')
|
||||
|
||||
# # Add statistical measures as eps candidates using cosine distances
|
||||
# eps_candidates.extend([
|
||||
# np.mean(cosine_distances) * 0.3,
|
||||
# np.mean(cosine_distances) * 0.4,
|
||||
# np.mean(cosine_distances) * 0.5,
|
||||
# np.mean(cosine_distances) * 0.6,
|
||||
# np.mean(cosine_distances) * 0.7,
|
||||
# np.median(cosine_distances) * 0.3,
|
||||
# np.median(cosine_distances) * 0.4,
|
||||
# np.median(cosine_distances) * 0.5,
|
||||
# np.median(cosine_distances) * 0.6,
|
||||
# np.std(cosine_distances) * 0.5,
|
||||
# np.std(cosine_distances) * 0.8,
|
||||
# np.std(cosine_distances) * 1.0,
|
||||
# np.std(cosine_distances) * 1.2
|
||||
# ])
|
||||
|
||||
|
||||
# Method 3: Manual eps values for cosine distances (0-2 range)
|
||||
manual_eps = [0.001, 0.002, 0.005, 0.01, 0.02, 0.03, 0.04, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5,
|
||||
0.6, 0.7, 0.8, 0.9, 1.0]
|
||||
# eps_candidates.extend(manual_eps)
|
||||
|
||||
# eps_candidates = manual_eps
|
||||
eps_candidates = [0.2]
|
||||
|
||||
# Remove duplicates and invalid values, then sort
|
||||
eps_candidates = sorted(list(set([eps for eps in eps_candidates if eps > 0])))
|
||||
|
||||
# Extensive min_samples candidates
|
||||
# min_samples_candidates = [3, 4, 5, 6, 7, 8, 9, 10, 12, 15, 18, 20, 25, 30, 35, 40, 50]
|
||||
|
||||
# Filter min_samples based on dataset size
|
||||
# max_min_samples = len(self.embeddings_normalized) // 10 # At most 10% of data
|
||||
# min_samples_candidates = [ms for ms in min_samples_candidates if ms <= max_min_samples]
|
||||
min_samples_candidates = [50]
|
||||
|
||||
|
||||
best_score = -1
|
||||
best_params = None
|
||||
best_labels = None
|
||||
|
||||
print(f"Testing {len(eps_candidates)} eps values x {len(min_samples_candidates)} min_samples values")
|
||||
print(f"Total combinations: {len(eps_candidates) * len(min_samples_candidates)}")
|
||||
print("This may take a while...\n")
|
||||
|
||||
# Track all results for analysis
|
||||
all_results = []
|
||||
|
||||
total_combinations = len(eps_candidates) * len(min_samples_candidates)
|
||||
current_combination = 0
|
||||
|
||||
for eps in eps_candidates:
|
||||
for min_samples in min_samples_candidates:
|
||||
current_combination += 1
|
||||
|
||||
# Progress indicator
|
||||
if current_combination % 50 == 0 or current_combination == total_combinations:
|
||||
progress = (current_combination / total_combinations) * 100
|
||||
print(f"Progress: {current_combination}/{total_combinations} ({progress:.1f}%)")
|
||||
|
||||
try:
|
||||
dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric='cosine')
|
||||
labels = dbscan.fit_predict(self.embeddings_normalized)
|
||||
|
||||
# Check if we have meaningful clusters
|
||||
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
|
||||
n_noise = list(labels).count(-1)
|
||||
noise_ratio = n_noise / len(labels)
|
||||
|
||||
# Store result for analysis
|
||||
result_info = {
|
||||
'eps': eps,
|
||||
'min_samples': min_samples,
|
||||
'n_clusters': n_clusters,
|
||||
'n_noise': n_noise,
|
||||
'noise_ratio': noise_ratio
|
||||
}
|
||||
|
||||
# Check if we have meaningful clusters
|
||||
print(n_clusters, n_noise, noise_ratio, eps, min_samples)
|
||||
if n_clusters >= 2 and noise_ratio < 0.9: # Not too many noise points
|
||||
# Calculate silhouette score (excluding noise) using cosine metric
|
||||
mask = labels != -1
|
||||
if np.sum(mask) > 1:
|
||||
try:
|
||||
score = silhouette_score(self.embeddings_normalized[mask], labels[mask], metric='cosine')
|
||||
result_info['silhouette_score'] = score
|
||||
|
||||
# Print promising results
|
||||
if score > 0.1: # Only show decent scores
|
||||
print(f"eps={eps:.4f}, min_samples={min_samples}: {n_clusters} clusters, {n_noise} noise ({noise_ratio:.1%}), silhouette={score:.4f}")
|
||||
|
||||
print(value_counts(labels))
|
||||
|
||||
if score > best_score:
|
||||
best_score = score
|
||||
best_params = (eps, min_samples)
|
||||
best_labels = labels
|
||||
except Exception:
|
||||
result_info['silhouette_score'] = None
|
||||
else:
|
||||
result_info['silhouette_score'] = None
|
||||
|
||||
all_results.append(result_info)
|
||||
|
||||
except Exception as e:
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
# Skip problematic parameter combinations
|
||||
continue
|
||||
|
||||
# Analysis of results
|
||||
print("\n" + "="*50)
|
||||
print("DBSCAN GRID SEARCH ANALYSIS")
|
||||
print("="*50)
|
||||
|
||||
if all_results:
|
||||
# Convert to numpy for easier analysis
|
||||
import pandas as pd
|
||||
df_results = pd.DataFrame(all_results)
|
||||
|
||||
print(f"Total parameter combinations tested: {len(df_results)}")
|
||||
|
||||
# Valid results (with clusters)
|
||||
valid_results = df_results[df_results['n_clusters'] >= 2]
|
||||
print(f"Combinations that produced clusters: {len(valid_results)}")
|
||||
|
||||
if len(valid_results) > 0:
|
||||
# Best silhouette scores
|
||||
scored_results = valid_results.dropna(subset=['silhouette_score'])
|
||||
if len(scored_results) > 0:
|
||||
print(f"Combinations with valid silhouette scores: {len(scored_results)}")
|
||||
print(f"Best silhouette score: {scored_results['silhouette_score'].max():.4f}")
|
||||
print(f"Mean silhouette score: {scored_results['silhouette_score'].mean():.4f}")
|
||||
|
||||
# Top 5 results
|
||||
top_results = scored_results.nlargest(5, 'silhouette_score')
|
||||
print("\nTop 5 parameter combinations:")
|
||||
for idx, row in top_results.iterrows():
|
||||
print(f" eps={row['eps']:.4f}, min_samples={row['min_samples']}: "
|
||||
f"{row['n_clusters']} clusters, silhouette={row['silhouette_score']:.4f}")
|
||||
|
||||
# Cluster count distribution
|
||||
cluster_counts = valid_results['n_clusters'].value_counts().sort_index()
|
||||
print("\nCluster count distribution:")
|
||||
for n_clusters, count in cluster_counts.items():
|
||||
print(f" {n_clusters} clusters: {count} parameter combinations")
|
||||
|
||||
print("\n📁 SAVING DETAILED RESULTS...")
|
||||
print("="*30)
|
||||
|
||||
# Save detailed grid search results to JSON file
|
||||
self.save_dbscan_grid_search_results(all_results, best_params, best_score)
|
||||
|
||||
if best_labels is not None:
|
||||
n_clusters = len(set(best_labels)) - (1 if -1 in best_labels else 0)
|
||||
n_noise = list(best_labels).count(-1)
|
||||
|
||||
print("\nBest DBSCAN result:")
|
||||
print(f"Parameters: eps={best_params[0]:.4f}, min_samples={best_params[1]}")
|
||||
print(f"Number of clusters: {n_clusters}")
|
||||
print(f"Number of noise points: {n_noise} ({n_noise/len(best_labels)*100:.1f}%)")
|
||||
print(f"Silhouette score: {best_score:.4f}")
|
||||
|
||||
return best_labels
|
||||
else:
|
||||
print("DBSCAN could not find suitable clusters with the extensive grid search")
|
||||
print("Consider:")
|
||||
print("- Adjusting the embedding space (different model or preprocessing)")
|
||||
print("- Using different clustering algorithms")
|
||||
print("- Manual parameter tuning based on domain knowledge")
|
||||
return None
|
||||
|
||||
def save_dbscan_grid_search_results(self, all_results, best_params, best_score):
    """Persist the DBSCAN grid search as a detailed JSON report plus a CSV summary.

    Writes ``dbscan_grid_search_detailed.json`` to the current working
    directory (overwriting any previous report) and then delegates to
    ``self.save_grid_search_csv`` for ``dbscan_grid_search_summary.csv``.

    Args:
        all_results: List of per-trial dicts carrying the keys ``eps``,
            ``min_samples``, ``n_clusters``, ``n_noise``, ``noise_ratio`` and
            ``silhouette_score`` (``None`` when no score was computed).
        best_params: ``(eps, min_samples)`` of the best trial, or a falsy
            value when no trial qualified.
        best_score: Silhouette score of the best trial. Anything ``<= -1``
            is stored as ``null``.
    """
    import datetime
    import statistics

    def _plain(value):
        # json cannot encode NumPy scalars (e.g. float32), so unwrap them.
        return value.item() if isinstance(value, np.generic) else value

    # Prepare comprehensive results data
    grid_search_data = {
        "experiment_info": {
            "timestamp": datetime.datetime.now().isoformat(),
            "dataset_path": self.embeddings_path,
            "total_samples": len(self.file_paths),
            "embedding_dimension": self.embeddings.shape[1],
            "total_combinations_tested": len(all_results),
        },
        "best_result": {
            "eps": _plain(best_params[0]) if best_params else None,
            "min_samples": _plain(best_params[1]) if best_params else None,
            "silhouette_score": _plain(best_score) if best_score > -1 else None,
        },
        "all_trials": [],
    }

    # Add all trial results; a trial counts as successful only when it has a score.
    for trial_id, result in enumerate(all_results, start=1):
        score = _plain(result['silhouette_score'])
        grid_search_data["all_trials"].append({
            "trial_id": trial_id,
            "parameters": {
                "eps": _plain(result['eps']),
                "min_samples": _plain(result['min_samples']),
            },
            "results": {
                "n_clusters": _plain(result['n_clusters']),
                "n_noise": _plain(result['n_noise']),
                "noise_ratio": _plain(result['noise_ratio']),
                "silhouette_score": score,
            },
            "status": "success" if score is not None else "failed",
        })

    # Summary sections are only emitted when at least one trial succeeded,
    # so readers of the JSON must treat them as optional keys.
    valid_trials = [t for t in grid_search_data["all_trials"] if t["status"] == "success"]
    if valid_trials:
        silhouette_scores = [t["results"]["silhouette_score"] for t in valid_trials]
        grid_search_data["summary_statistics"] = {
            "total_trials": len(all_results),
            "successful_trials": len(valid_trials),
            "success_rate": len(valid_trials) / len(all_results),
            "best_silhouette_score": max(silhouette_scores),
            "worst_silhouette_score": min(silhouette_scores),
            "mean_silhouette_score": sum(silhouette_scores) / len(silhouette_scores),
            # True median: averages the two middle values for even counts.
            "median_silhouette_score": statistics.median(silhouette_scores),
        }

        # Top 10 results
        grid_search_data["top_10_results"] = sorted(
            valid_trials,
            key=lambda t: t["results"]["silhouette_score"],
            reverse=True,
        )[:10]

        # Parameter analysis (over successful trials only)
        eps_values = [t["parameters"]["eps"] for t in valid_trials]
        min_samples_values = [t["parameters"]["min_samples"] for t in valid_trials]
        grid_search_data["parameter_analysis"] = {
            "eps_range": {
                "min": min(eps_values),
                "max": max(eps_values),
                "mean": sum(eps_values) / len(eps_values),
            },
            "min_samples_range": {
                "min": min(min_samples_values),
                "max": max(min_samples_values),
                "mean": sum(min_samples_values) / len(min_samples_values),
            },
        }

    # Fixed file name: each run overwrites the previous report.
    filename = "dbscan_grid_search_detailed.json"
    # ensure_ascii=False emits raw non-ASCII characters, so pin the encoding.
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(grid_search_data, f, indent=4, ensure_ascii=False)

    print(f"Detailed grid search results saved to: {filename}")

    # Also save a CSV summary for easy analysis
    csv_filename = "dbscan_grid_search_summary.csv"
    self.save_grid_search_csv(all_results, csv_filename)
    print(f"Grid search summary CSV saved to: {csv_filename}")
|
||||
|
||||
def save_grid_search_csv(self, all_results, filename):
    """Write one CSV row per grid-search trial to ``filename``.

    Each row holds a 1-based ``trial_id``, the trial's parameters and
    metrics, and a ``status`` column that is ``success`` when the trial has
    a silhouette score and ``failed`` when the score is ``None``.

    Args:
        all_results: Iterable of trial dicts with the keys ``eps``,
            ``min_samples``, ``n_clusters``, ``n_noise``, ``noise_ratio``
            and ``silhouette_score``.
        filename: Destination path; an existing file is overwritten.
    """
    import csv

    columns = ['trial_id', 'eps', 'min_samples', 'n_clusters', 'n_noise',
               'noise_ratio', 'silhouette_score', 'status']
    # Columns copied verbatim from each trial dict.
    copied = ('eps', 'min_samples', 'n_clusters', 'n_noise',
              'noise_ratio', 'silhouette_score')

    def as_row(number, trial):
        row = {'trial_id': number}
        for key in copied:
            row[key] = trial[key]
        row['status'] = 'failed' if trial['silhouette_score'] is None else 'success'
        return row

    # newline='' lets the csv module control line endings itself.
    with open(filename, 'w', newline='', encoding='utf-8') as handle:
        out = csv.DictWriter(handle, fieldnames=columns)
        out.writeheader()
        for number, trial in enumerate(all_results, start=1):
            out.writerow(as_row(number, trial))
|
||||
|
||||
def run_mean_shift(self):
    """Run Mean Shift over several candidate bandwidths and keep the best result.

    Bandwidth candidates come from two sources, each contributing 0.5x, 1x
    and 1.5x of its estimate: sklearn's ``estimate_bandwidth`` and the mean
    cosine distance to the nearest neighbours (up to 9 per sample). A
    candidate is scored with the cosine silhouette only when it yields
    between 2 and ``n_samples // 3`` clusters.

    Returns:
        The label array of the best-scoring bandwidth, or ``None`` when no
        candidate produced an acceptable number of clusters.
    """
    print("\n" + "="*50)
    print("RUNNING MEAN SHIFT CLUSTERING")
    print("="*50)

    from sklearn.cluster import estimate_bandwidth

    data = self.embeddings_normalized
    n_samples = len(data)
    bandwidth_candidates = []

    # Method 1: sklearn's estimate_bandwidth (note: it does not support cosine directly)
    try:
        bw_est = estimate_bandwidth(data, quantile=0.3, n_samples=min(500, n_samples))
        if bw_est > 0:
            bandwidth_candidates.extend([bw_est * 0.5, bw_est, bw_est * 1.5])
    except Exception as e:
        # Best effort only: method 2 below can still supply candidates.
        print(f"estimate_bandwidth failed ({str(e)[:50]}...); using nearest-neighbour distances only")

    # Method 2: nearest neighbor cosine distances.
    # kneighbors rejects n_neighbors > n_samples, so clamp for tiny datasets.
    n_neighbors = min(10, n_samples)
    if n_neighbors > 1:
        neighbors_fit = NearestNeighbors(n_neighbors=n_neighbors, metric='cosine').fit(data)
        distances, _ = neighbors_fit.kneighbors(data)
        mean_dist = np.mean(distances[:, 1:])  # Exclude self-distance
        bandwidth_candidates.extend([mean_dist * 0.5, mean_dist, mean_dist * 1.5])

    # Remove duplicates and invalid values; sorted for a reproducible trial order.
    bandwidth_candidates = sorted({bw for bw in bandwidth_candidates if bw > 0})

    if not bandwidth_candidates:
        bandwidth_candidates = [0.5, 1.0, 1.5, 2.0]

    best_score = -1
    best_bandwidth = None
    best_labels = None

    # NOTE(review): MeanShift takes no metric argument, so the cosine-derived
    # bandwidths are applied in its own distance space -- confirm this is intended.
    print("Testing different bandwidth values...")
    for bandwidth in bandwidth_candidates:
        try:
            labels = MeanShift(bandwidth=bandwidth).fit_predict(data)

            n_clusters = len(set(labels))
            if 2 <= n_clusters <= n_samples // 3:
                score = silhouette_score(data, labels, metric='cosine')
                print(f"bandwidth={bandwidth:.4f}: {n_clusters} clusters, silhouette={score:.4f}")

                if score > best_score:
                    best_score = score
                    best_bandwidth = bandwidth
                    best_labels = labels
        except Exception as e:
            print(f"bandwidth={bandwidth:.4f}: failed ({str(e)[:50]}...)")
            continue

    if best_labels is None:
        print("Mean Shift could not find suitable clusters")
        return None

    n_clusters = len(set(best_labels))
    print("\nBest Mean Shift result:")
    print(f"Bandwidth: {best_bandwidth:.4f}")
    print(f"Number of clusters: {n_clusters}")
    print(f"Silhouette score: {best_score:.4f}")

    return best_labels
|
||||
|
||||
def run_affinity_propagation(self):
    """Run Affinity Propagation on cosine similarities over a small parameter grid.

    The pairwise cosine-similarity matrix is passed to the model as a
    precomputed affinity, so the ``preference`` values (percentiles of the
    off-diagonal similarities) are on the same scale as the similarities the
    algorithm actually uses. Each (preference, damping) pair is scored with
    the cosine silhouette when it yields between 2 and ``n_samples // 3``
    clusters.

    Returns:
        The label array of the best-scoring combination, or ``None`` when no
        combination produced an acceptable number of clusters.
    """
    print("\n" + "="*50)
    print("RUNNING AFFINITY PROPAGATION CLUSTERING")
    print("="*50)

    from sklearn.metrics.pairwise import cosine_similarity

    data = self.embeddings_normalized
    n_samples = len(data)

    # Calculate similarity matrix using cosine similarity
    similarities = cosine_similarity(data)

    # The diagonal holds each sample's self-similarity (always 1.0), so its
    # percentiles are all identical; draw preferences from the off-diagonal
    # entries instead. Lower preference -> fewer exemplars/clusters.
    off_diagonal = similarities[~np.eye(n_samples, dtype=bool)]
    if off_diagonal.size == 0:
        print("Affinity Propagation could not find suitable clusters")
        return None
    preference_candidates = sorted(set(np.percentile(off_diagonal, [10, 25, 50, 75]).tolist()))

    damping_candidates = [0.5, 0.7, 0.8, 0.9]

    best_score = -1
    best_params = None
    best_labels = None

    print("Testing different parameter combinations...")
    for preference in preference_candidates:
        for damping in damping_candidates:
            try:
                affinity_prop = AffinityPropagation(
                    affinity='precomputed',
                    preference=preference,
                    damping=damping,
                    random_state=42,
                    max_iter=200
                )
                labels = affinity_prop.fit_predict(similarities)

                n_clusters = len(set(labels))
                if 2 <= n_clusters <= n_samples // 3:
                    score = silhouette_score(data, labels, metric='cosine')
                    print(f"preference={preference:.2f}, damping={damping:.1f}: {n_clusters} clusters, silhouette={score:.4f}")

                    if score > best_score:
                        best_score = score
                        best_params = (preference, damping)
                        best_labels = labels
            except Exception as e:
                print(f"preference={preference:.2f}, damping={damping:.1f}: failed ({str(e)[:30]}...)")
                continue

    if best_labels is None:
        print("Affinity Propagation could not find suitable clusters")
        return None

    n_clusters = len(set(best_labels))
    print("\nBest Affinity Propagation result:")
    print(f"Parameters: preference={best_params[0]:.2f}, damping={best_params[1]:.1f}")
    print(f"Number of clusters: {n_clusters}")
    print(f"Silhouette score: {best_score:.4f}")

    return best_labels
|
||||
|
||||
def visualize_results(self, results_dict):
    """Plot each clustering in ``results_dict`` on a shared 2-D PCA projection.

    One subplot is drawn per method. Points labelled ``-1`` are drawn as
    black crosses ("Noise"); every other label gets its own Set1 colour. The
    figure is saved to ``auto_clustering_results.png`` and then shown.

    Args:
        results_dict: Mapping of method name to label array aligned with
            ``self.embeddings_normalized``. Nothing is plotted when empty.
    """
    if not results_dict:
        print("No results to visualize")
        return

    # Project the embeddings to two components once; all panels share it.
    projected = PCA(n_components=2, random_state=42).fit_transform(self.embeddings_normalized)

    panel_count = len(results_dict)
    fig, panels = plt.subplots(1, panel_count, figsize=(5*panel_count, 4))

    # subplots returns a bare Axes (not a sequence) for a single panel.
    if panel_count == 1:
        panels = [panels]

    for panel, (method_name, labels) in zip(panels, results_dict.items()):
        distinct = set(labels)
        palette = plt.cm.Set1(np.linspace(0, 1, len(distinct)))

        for label, color in zip(distinct, palette):
            selected = labels == label
            if label == -1:
                # Noise points in black
                style = dict(c='black', marker='x', s=20, alpha=0.5, label='Noise')
            else:
                style = dict(c=[color], s=50, alpha=0.7, label=f'Cluster {label}')
            panel.scatter(projected[selected, 0], projected[selected, 1], **style)

        # The noise label is not counted as a cluster in the title.
        panel.set_title(f'{method_name}\n({len(set(labels)) - (1 if -1 in labels else 0)} clusters)')
        panel.set_xlabel('PCA Component 1')
        panel.set_ylabel('PCA Component 2')
        panel.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig('auto_clustering_results.png', dpi=300, bbox_inches='tight')
    plt.show()

    print("\nVisualization saved as 'auto_clustering_results.png'")
|
||||
|
||||
def save_results(self, results_dict):
    """Save each method's cluster assignments to its own timestamped JSON file.

    For every ``method_name -> labels`` entry a file named
    ``<method_name lower-cased, spaces as underscores>_results_<YYYYmmdd_HHMMSS>.json``
    is written to the current working directory. It contains the method
    name, the cluster count (the ``-1`` label is not counted), the sample
    count and one record per file path.

    Args:
        results_dict: Mapping of method name to a label sequence aligned
            with ``self.file_paths``. Only ``"DBSCAN"`` entries mark label
            ``-1`` as noise; all other methods always get ``is_noise: false``.
    """
    # Imported locally so the method does not rely on a module-level import.
    import datetime

    for method_name, labels in results_dict.items():
        marks_noise = method_name == 'DBSCAN'

        # int()/bool() turn NumPy scalars into JSON-serializable builtins.
        method_results = [
            {
                "filepath": filepath,
                "cluster": int(label),
                "is_noise": bool(marks_noise and label == -1),
            }
            for filepath, label in zip(self.file_paths, labels)
        ]

        # Save to file
        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"{method_name.lower().replace(' ', '_')}_results_{timestamp}.json"
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump({
                "method": method_name,
                "n_clusters": len(set(labels)) - (1 if -1 in labels else 0),
                "n_samples": len(labels),
                "results": method_results
            }, f, indent=4)

        print(f"Results saved to {filename}")
|
||||
|
||||
def run_all_methods(self):
    """Run the enabled automatic clustering methods and report on them.

    Only DBSCAN is currently enabled; the Mean Shift and Affinity
    Propagation calls are commented out below. When at least one method
    returns labels, a summary is printed and the results are passed to
    ``visualize_results`` and ``save_results``.

    Returns:
        Dict mapping method name to its label array; empty when no method
        found suitable clusters.
    """
    banner = "="*70
    print("\n" + banner)
    print("AUTOMATIC CLUSTERING ANALYSIS")
    print(banner)
    print(f"Dataset: {len(self.file_paths)} documents")
    print(f"Embedding dimension: {self.embeddings.shape[1]}")

    results = {}

    # Run DBSCAN
    dbscan_labels = self.run_dbscan()
    if dbscan_labels is not None:
        results["DBSCAN"] = dbscan_labels

    # Run Mean Shift (disabled)
    # meanshift_labels = self.run_mean_shift()
    # if meanshift_labels is not None:
    #     results["Mean Shift"] = meanshift_labels

    # Run Affinity Propagation (disabled)
    # affinity_labels = self.run_affinity_propagation()
    # if affinity_labels is not None:
    #     results["Affinity Propagation"] = affinity_labels

    if not results:
        print("\nNo automatic clustering method found suitable clusters.")
        print("This might indicate:")
        print("- Data doesn't have clear cluster structure")
        print("- Embeddings need different preprocessing")
        print("- Different parameter ranges needed")
        return results

    # Summary
    print("\n" + banner)
    print("SUMMARY OF RESULTS")
    print(banner)

    for method, labels in results.items():
        # Label -1 (noise) is not counted as a cluster.
        n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
        if method == "DBSCAN":
            n_noise = list(labels).count(-1)
            print(f"{method}: {n_clusters} clusters, {n_noise} noise points")
        else:
            print(f"{method}: {n_clusters} clusters")

    # Calculate agreement between methods if multiple succeeded
    if len(results) > 1:
        from sklearn.metrics import adjusted_rand_score
        print("\nMethod Agreement (Adjusted Rand Index):")
        method_names = list(results.keys())
        for i, first in enumerate(method_names):
            for second in method_names[i+1:]:
                ari = adjusted_rand_score(results[first], results[second])
                print(f"{first} vs {second}: {ari:.4f}")

    # Visualize and save results
    self.visualize_results(results)
    self.save_results(results)

    return results
|
||||
|
||||
def main():
    """Command-line entry point: load embeddings and run the selected method(s).

    ``--embeddings_path`` is required. ``--method`` selects ``all`` (the
    default), a single method, or ``None`` to only load the embeddings.
    For a single method, its labels are visualized and saved when the
    method returns any.
    """
    parser = argparse.ArgumentParser(description="Run automatic clustering methods on document embeddings")
    # Required: without a path the loader would fail later inside open().
    parser.add_argument("--embeddings_path", required=True, help="Path to embeddings JSON file")
    parser.add_argument("--method", choices=['None', 'dbscan', 'meanshift', 'affinity', 'all'], default='all',
                        help="Which automatic method to run")

    args = parser.parse_args()

    # Initialize clustering (this loads the embeddings immediately)
    clustering = AutoClustering(args.embeddings_path)

    # Run selected method(s)
    if args.method == 'all':
        clustering.run_all_methods()
        return

    # CLI choice -> (display name used in outputs, bound runner)
    single_methods = {
        'dbscan': ("DBSCAN", clustering.run_dbscan),
        'meanshift': ("Mean Shift", clustering.run_mean_shift),
        'affinity': ("Affinity Propagation", clustering.run_affinity_propagation),
    }
    if args.method not in single_methods:
        # 'None': nothing to run beyond loading the embeddings.
        return

    display_name, runner = single_methods[args.method]
    labels = runner()
    if labels is not None:
        clustering.visualize_results({display_name: labels})
        clustering.save_results({display_name: labels})


if __name__ == "__main__":
    main()
|
BIN
cluster/auto_clustering_results.png
Normal file
BIN
cluster/auto_clustering_results.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 364 KiB |
2
cluster/dbscan_grid_search_summary.csv
Normal file
2
cluster/dbscan_grid_search_summary.csv
Normal file
@@ -0,0 +1,2 @@
|
||||
trial_id,eps,min_samples,n_clusters,n_noise,noise_ratio,silhouette_score,status
|
||||
1,0.2,50,5,374,0.13357142857142856,0.6100894212722778,success
|
|
23
cluster/dbscan_grid_search_summary_20250731_232801.csv
Normal file
23
cluster/dbscan_grid_search_summary_20250731_232801.csv
Normal file
@@ -0,0 +1,23 @@
|
||||
trial_id,eps,min_samples,n_clusters,n_noise,noise_ratio,silhouette_score,status
|
||||
1,0.001,50,0,2800,1.0,,failed
|
||||
2,0.002,50,0,2800,1.0,,failed
|
||||
3,0.005,50,0,2800,1.0,,failed
|
||||
4,0.01,50,2,2436,0.87,0.8994060754776001,success
|
||||
5,0.02,50,2,2220,0.7928571428571428,0.7592437863349915,success
|
||||
6,0.03,50,1,2168,0.7742857142857142,,failed
|
||||
7,0.04,50,1,2157,0.7703571428571429,,failed
|
||||
8,0.05,50,2,2089,0.7460714285714286,0.8926841616630554,success
|
||||
9,0.1,50,6,1204,0.43,0.6831505298614502,success
|
||||
10,0.15,50,4,645,0.23035714285714284,0.6648684740066528,success
|
||||
11,0.2,50,5,374,0.13357142857142856,0.6100894212722778,success
|
||||
12,0.25,50,3,258,0.09214285714285714,0.41854172945022583,success
|
||||
13,0.3,50,1,210,0.075,,failed
|
||||
14,0.35,50,1,163,0.05821428571428571,,failed
|
||||
15,0.4,50,1,145,0.05178571428571429,,failed
|
||||
16,0.45,50,1,123,0.04392857142857143,,failed
|
||||
17,0.5,50,1,107,0.038214285714285715,,failed
|
||||
18,0.6,50,1,23,0.008214285714285714,,failed
|
||||
19,0.7,50,1,0,0.0,,failed
|
||||
20,0.8,50,1,0,0.0,,failed
|
||||
21,0.9,50,1,0,0.0,,failed
|
||||
22,1.0,50,1,0,0.0,,failed
|
|
23
cluster/dbscan_grid_search_summary_20250731_232811.csv
Normal file
23
cluster/dbscan_grid_search_summary_20250731_232811.csv
Normal file
@@ -0,0 +1,23 @@
|
||||
trial_id,eps,min_samples,n_clusters,n_noise,noise_ratio,silhouette_score,status
|
||||
1,0.001,50,0,2800,1.0,,failed
|
||||
2,0.002,50,0,2800,1.0,,failed
|
||||
3,0.005,50,0,2800,1.0,,failed
|
||||
4,0.01,50,2,2436,0.87,0.8994060754776001,success
|
||||
5,0.02,50,2,2220,0.7928571428571428,0.7592437863349915,success
|
||||
6,0.03,50,1,2168,0.7742857142857142,,failed
|
||||
7,0.04,50,1,2157,0.7703571428571429,,failed
|
||||
8,0.05,50,2,2089,0.7460714285714286,0.8926841616630554,success
|
||||
9,0.1,50,6,1204,0.43,0.6831505298614502,success
|
||||
10,0.15,50,4,645,0.23035714285714284,0.6648684740066528,success
|
||||
11,0.2,50,5,374,0.13357142857142856,0.6100894212722778,success
|
||||
12,0.25,50,3,258,0.09214285714285714,0.41854172945022583,success
|
||||
13,0.3,50,1,210,0.075,,failed
|
||||
14,0.35,50,1,163,0.05821428571428571,,failed
|
||||
15,0.4,50,1,145,0.05178571428571429,,failed
|
||||
16,0.45,50,1,123,0.04392857142857143,,failed
|
||||
17,0.5,50,1,107,0.038214285714285715,,failed
|
||||
18,0.6,50,1,23,0.008214285714285714,,failed
|
||||
19,0.7,50,1,0,0.0,,failed
|
||||
20,0.8,50,1,0,0.0,,failed
|
||||
21,0.9,50,1,0,0.0,,failed
|
||||
22,1.0,50,1,0,0.0,,failed
|
|
BIN
cluster/gmm_clustering_results.png
Normal file
BIN
cluster/gmm_clustering_results.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 747 KiB |
649
cluster/gmm_extensive.py
Normal file
649
cluster/gmm_extensive.py
Normal file
@@ -0,0 +1,649 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Extensive Gaussian Mixture Model clustering with grid search for optimal parameters
|
||||
Includes BIC and AIC metrics for model selection
|
||||
"""
|
||||
|
||||
import json
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
from sklearn.mixture import GaussianMixture
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
|
||||
from sklearn.decomposition import PCA
|
||||
import datetime
|
||||
import csv
|
||||
import argparse
|
||||
import warnings
|
||||
warnings.filterwarnings('ignore')
|
||||
|
||||
class GMMExtensiveClustering:
|
||||
def __init__(self, embeddings_path):
|
||||
self.embeddings_path = embeddings_path
|
||||
self.embeddings = None
|
||||
self.file_paths = None
|
||||
self.load_embeddings()
|
||||
|
||||
def load_embeddings(self):
    """Load file paths and embeddings from the JSON file and standardize them.

    The file at ``self.embeddings_path`` must contain a list of records,
    each with a ``filepath`` and an ``embedding`` entry. Populates
    ``self.file_paths``, ``self.embeddings`` (float32 array),
    ``self.scaler`` and ``self.embeddings_scaled``.

    Raises:
        ValueError: If the file contains no records.
    """
    print(f"Loading embeddings from {self.embeddings_path}...")
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(self.embeddings_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    if not data:
        # Fail here with a clear message instead of on the shape lookup below.
        raise ValueError(f"No embeddings found in {self.embeddings_path}")

    self.file_paths = [item['filepath'] for item in data]
    self.embeddings = np.array([item['embedding'] for item in data], dtype=np.float32)
    print(f"Loaded {len(self.file_paths)} samples with embedding dimension {self.embeddings.shape[1]}")

    # Standardize embeddings for better clustering
    self.scaler = StandardScaler()
    self.embeddings_scaled = self.scaler.fit_transform(self.embeddings)
|
||||
|
||||
def run_gmm_grid_search(self):
|
||||
"""Run GMM with optimized grid search for faster execution"""
|
||||
print("\n" + "="*70)
|
||||
print("RUNNING GAUSSIAN MIXTURE MODEL CLUSTERING WITH OPTIMIZED GRID SEARCH")
|
||||
print("="*70)
|
||||
|
||||
# Optimized GMM parameter candidates for faster execution
|
||||
|
||||
# Smart n_components range with larger steps
|
||||
max_components = min(50, len(self.embeddings_scaled) // 20) # Reduced max and increased divisor
|
||||
n_components_candidates = []
|
||||
|
||||
# Progressive step sizes: smaller steps for low numbers, larger for high
|
||||
for n in range(2, max_components + 1):
|
||||
if n <= 5:
|
||||
n_components_candidates.append(n) # 2, 3, 4, 5
|
||||
elif n <= 10:
|
||||
if n % 2 == 0: # 6, 8, 10
|
||||
n_components_candidates.append(n)
|
||||
else:
|
||||
if n % 3 == 2: # 11, 14, 17, 20
|
||||
n_components_candidates.append(n)
|
||||
|
||||
# Reduced covariance types - focus on most important ones
|
||||
covariance_types = [
|
||||
# 'full', 'diag',
|
||||
'tied', 'spherical'
|
||||
] # Removed 'tied' and 'spherical' as they're less common
|
||||
|
||||
# Simplified regularization - focus on key values
|
||||
reg_covar_candidates = [1e-5, 1e-4, 1e-3] # Removed extreme values
|
||||
|
||||
# Reduced n_init - 1 is often sufficient for good initialization methods
|
||||
n_init_candidates = [1, 5] # Removed 10 to save time
|
||||
|
||||
# Focus on best initialization methods
|
||||
init_params_candidates = ['kmeans', 'k-means++'] # Removed 'random' and 'random_from_data'
|
||||
|
||||
# Simplified max_iter - most problems converge quickly
|
||||
max_iter_candidates = [100, 300] # Removed 500, added 300 as middle ground
|
||||
|
||||
print(f"Optimized parameter combinations:")
|
||||
print(f" - n_components: {len(n_components_candidates)} values {n_components_candidates}")
|
||||
print(f" - covariance_types: {len(covariance_types)} options {covariance_types}")
|
||||
print(f" - reg_covar: {len(reg_covar_candidates)} values {reg_covar_candidates}")
|
||||
print(f" - n_init: {len(n_init_candidates)} values {n_init_candidates}")
|
||||
print(f" - init_params: {len(init_params_candidates)} options {init_params_candidates}")
|
||||
print(f" - max_iter: {len(max_iter_candidates)} values {max_iter_candidates}")
|
||||
|
||||
total_combinations = (len(n_components_candidates) * len(covariance_types) *
|
||||
len(reg_covar_candidates) * len(n_init_candidates) *
|
||||
len(init_params_candidates) * len(max_iter_candidates))
|
||||
print(f"Total combinations: {total_combinations} (optimized for speed)")
|
||||
|
||||
# Estimate time
|
||||
estimated_time_per_combination = 0.5 # seconds (conservative estimate)
|
||||
estimated_total_time = total_combinations * estimated_time_per_combination
|
||||
print(f"Estimated runtime: {estimated_total_time/60:.1f} minutes")
|
||||
print("This should be much faster...\n")
|
||||
|
||||
# Track all results for analysis
|
||||
all_results = []
|
||||
|
||||
# Early stopping criteria for speed optimization
|
||||
early_stopping_threshold = 0.7 # If we find a very good silhouette score, we can be less exhaustive
|
||||
good_results_found = 0
|
||||
max_good_results = 5 # Stop early if we find several very good results
|
||||
|
||||
best_bic_score = float('inf')
|
||||
best_aic_score = float('inf')
|
||||
best_silhouette_score = -1
|
||||
best_params_bic = None
|
||||
best_params_aic = None
|
||||
best_params_silhouette = None
|
||||
best_labels_bic = None
|
||||
best_labels_aic = None
|
||||
best_labels_silhouette = None
|
||||
|
||||
current_combination = 0
|
||||
|
||||
# Optimized iteration order: test simpler models first (fewer components, simpler covariance)
|
||||
for covariance_type in covariance_types: # Start with covariance type
|
||||
for n_components in n_components_candidates: # Then components
|
||||
for init_params in init_params_candidates: # Good initialization methods
|
||||
for reg_covar in reg_covar_candidates: # Regularization
|
||||
for n_init in n_init_candidates: # Number of initializations
|
||||
for max_iter in max_iter_candidates: # Iterations last
|
||||
current_combination += 1
|
||||
|
||||
# Progress indicator with time estimation
|
||||
if current_combination % 50 == 0 or current_combination == total_combinations:
|
||||
progress = (current_combination / total_combinations) * 100
|
||||
print(f"Progress: {current_combination}/{total_combinations} ({progress:.1f}%) - "
|
||||
f"Best scores so far: BIC={best_bic_score:.2f}, Silhouette={best_silhouette_score:.3f}")
|
||||
|
||||
try:
|
||||
# Early convergence check for faster models
|
||||
tol = 1e-3 if n_components <= 5 else 1e-4 # Less strict tolerance for simple models
|
||||
|
||||
# Run GMM
|
||||
gmm = GaussianMixture(
|
||||
n_components=n_components,
|
||||
covariance_type=covariance_type,
|
||||
reg_covar=reg_covar,
|
||||
n_init=n_init,
|
||||
init_params=init_params,
|
||||
max_iter=max_iter,
|
||||
tol=tol, # Added tolerance for faster convergence
|
||||
random_state=42
|
||||
)
|
||||
|
||||
# Fit and predict
|
||||
gmm.fit(self.embeddings_scaled)
|
||||
labels = gmm.predict(self.embeddings_scaled)
|
||||
|
||||
# Quick validation - skip if model didn't converge properly
|
||||
if not gmm.converged_ and max_iter <= 100:
|
||||
continue # Skip non-converged simple models
|
||||
|
||||
# Calculate metrics
|
||||
bic_score = gmm.bic(self.embeddings_scaled)
|
||||
aic_score = gmm.aic(self.embeddings_scaled)
|
||||
log_likelihood = gmm.score(self.embeddings_scaled)
|
||||
|
||||
# Only calculate clustering metrics if we have multiple clusters
|
||||
if len(set(labels)) > 1:
|
||||
silhouette = silhouette_score(self.embeddings_scaled, labels)
|
||||
calinski_harabasz = calinski_harabasz_score(self.embeddings_scaled, labels)
|
||||
davies_bouldin = davies_bouldin_score(self.embeddings_scaled, labels)
|
||||
|
||||
# Early stopping check
|
||||
if silhouette > early_stopping_threshold:
|
||||
good_results_found += 1
|
||||
print(f"🎯 Excellent result found: n_comp={n_components}, cov={covariance_type}, "
|
||||
f"silhouette={silhouette:.4f}")
|
||||
|
||||
else:
|
||||
silhouette = -1
|
||||
calinski_harabasz = 0
|
||||
davies_bouldin = float('inf')
|
||||
|
||||
# Store result for analysis
|
||||
result_info = {
|
||||
'n_components': n_components,
|
||||
'covariance_type': covariance_type,
|
||||
'reg_covar': reg_covar,
|
||||
'n_init': n_init,
|
||||
'init_params': init_params,
|
||||
'max_iter': max_iter,
|
||||
'bic_score': bic_score,
|
||||
'aic_score': aic_score,
|
||||
'log_likelihood': log_likelihood,
|
||||
'silhouette_score': silhouette,
|
||||
'calinski_harabasz_score': calinski_harabasz,
|
||||
'davies_bouldin_score': davies_bouldin,
|
||||
'converged': gmm.converged_,
|
||||
'n_iter': gmm.n_iter_,
|
||||
'unique_clusters': len(set(labels))
|
||||
}
|
||||
|
||||
all_results.append(result_info)
|
||||
|
||||
# Print promising results
|
||||
if (silhouette > 0.3 and bic_score < np.percentile([r['bic_score'] for r in all_results], 25)):
|
||||
print(f"n_components={n_components}, cov={covariance_type}, init={init_params}: "
|
||||
f"BIC={bic_score:.2f}, AIC={aic_score:.2f}, silhouette={silhouette:.4f}")
|
||||
|
||||
# Track best results for different criteria
|
||||
if bic_score < best_bic_score:
|
||||
best_bic_score = bic_score
|
||||
best_params_bic = {
|
||||
'n_components': n_components,
|
||||
'covariance_type': covariance_type,
|
||||
'reg_covar': reg_covar,
|
||||
'n_init': n_init,
|
||||
'init_params': init_params,
|
||||
'max_iter': max_iter
|
||||
}
|
||||
best_labels_bic = labels
|
||||
|
||||
if aic_score < best_aic_score:
|
||||
best_aic_score = aic_score
|
||||
best_params_aic = {
|
||||
'n_components': n_components,
|
||||
'covariance_type': covariance_type,
|
||||
'reg_covar': reg_covar,
|
||||
'n_init': n_init,
|
||||
'init_params': init_params,
|
||||
'max_iter': max_iter
|
||||
}
|
||||
best_labels_aic = labels
|
||||
|
||||
if silhouette > best_silhouette_score and len(set(labels)) > 1:
|
||||
best_silhouette_score = silhouette
|
||||
best_params_silhouette = {
|
||||
'n_components': n_components,
|
||||
'covariance_type': covariance_type,
|
||||
'reg_covar': reg_covar,
|
||||
'n_init': n_init,
|
||||
'init_params': init_params,
|
||||
'max_iter': max_iter
|
||||
}
|
||||
best_labels_silhouette = labels
|
||||
|
||||
# Early stopping check
|
||||
if good_results_found >= 5 and silhouette > 0.6:
|
||||
print(f"🛑 Early stopping triggered: Found {good_results_found} excellent results. "
|
||||
f"Stopping at {current_combination}/{total_combinations} combinations.")
|
||||
break
|
||||
|
||||
except Exception:
|
||||
# Skip problematic parameter combinations
|
||||
continue
|
||||
|
||||
# Break from nested loops if early stopping triggered
|
||||
if good_results_found >= 5 and best_silhouette_score > 0.6:
|
||||
break
|
||||
if good_results_found >= 5 and best_silhouette_score > 0.6:
|
||||
break
|
||||
if good_results_found >= 5 and best_silhouette_score > 0.6:
|
||||
break
|
||||
if good_results_found >= 5 and best_silhouette_score > 0.6:
|
||||
break
|
||||
if good_results_found >= 5 and best_silhouette_score > 0.6:
|
||||
break
|
||||
|
||||
# Analysis of results
|
||||
print("\n" + "="*70)
|
||||
print("GAUSSIAN MIXTURE MODEL GRID SEARCH ANALYSIS")
|
||||
print("="*70)
|
||||
|
||||
if all_results:
|
||||
import pandas as pd
|
||||
df_results = pd.DataFrame(all_results)
|
||||
|
||||
print(f"Total parameter combinations tested: {len(df_results)}")
|
||||
|
||||
# Filter results with valid clustering (more than 1 cluster)
|
||||
valid_results = df_results[df_results['unique_clusters'] > 1]
|
||||
print(f"Combinations with valid clustering: {len(valid_results)}")
|
||||
|
||||
if len(valid_results) > 0:
|
||||
# Best scores analysis
|
||||
print(f"\nModel Selection Metrics:")
|
||||
print(f"Best BIC score: {df_results['bic_score'].min():.2f}")
|
||||
print(f"Best AIC score: {df_results['aic_score'].min():.2f}")
|
||||
print(f"Best Log-Likelihood: {df_results['log_likelihood'].max():.2f}")
|
||||
|
||||
print(f"\nClustering Quality Metrics:")
|
||||
print(f"Best silhouette score: {valid_results['silhouette_score'].max():.4f}")
|
||||
print(f"Mean silhouette score: {valid_results['silhouette_score'].mean():.4f}")
|
||||
print(f"Best Calinski-Harabasz score: {valid_results['calinski_harabasz_score'].max():.2f}")
|
||||
print(f"Best Davies-Bouldin score: {valid_results['davies_bouldin_score'].min():.4f}")
|
||||
|
||||
# Top results by different criteria
|
||||
print(f"\nTop 5 results by BIC (lower is better):")
|
||||
top_bic = df_results.nsmallest(5, 'bic_score')
|
||||
for idx, row in top_bic.iterrows():
|
||||
print(f" n_comp={row['n_components']}, cov={row['covariance_type']}: "
|
||||
f"BIC={row['bic_score']:.2f}, AIC={row['aic_score']:.2f}")
|
||||
|
||||
print(f"\nTop 5 results by AIC (lower is better):")
|
||||
top_aic = df_results.nsmallest(5, 'aic_score')
|
||||
for idx, row in top_aic.iterrows():
|
||||
print(f" n_comp={row['n_components']}, cov={row['covariance_type']}: "
|
||||
f"BIC={row['bic_score']:.2f}, AIC={row['aic_score']:.2f}")
|
||||
|
||||
if len(valid_results) > 0:
|
||||
print(f"\nTop 5 results by Silhouette Score:")
|
||||
top_silhouette = valid_results.nlargest(5, 'silhouette_score')
|
||||
for idx, row in top_silhouette.iterrows():
|
||||
print(f" n_comp={row['n_components']}, cov={row['covariance_type']}: "
|
||||
f"silhouette={row['silhouette_score']:.4f}")
|
||||
|
||||
# Component count analysis
|
||||
component_performance = df_results.groupby('n_components').agg({
|
||||
'bic_score': 'min',
|
||||
'aic_score': 'min',
|
||||
'silhouette_score': 'max'
|
||||
}).reset_index()
|
||||
|
||||
print(f"\nComponent count analysis (top 10 by BIC):")
|
||||
top_components = component_performance.nsmallest(10, 'bic_score')
|
||||
for idx, row in top_components.iterrows():
|
||||
print(f" {row['n_components']} components: "
|
||||
f"BIC={row['bic_score']:.2f}, AIC={row['aic_score']:.2f}, "
|
||||
f"silhouette={row['silhouette_score']:.4f}")
|
||||
|
||||
print(f"\n📁 SAVING DETAILED RESULTS...")
|
||||
print("="*30)
|
||||
|
||||
# Save detailed grid search results
|
||||
self.save_gmm_grid_search_results(all_results,
|
||||
best_params_bic, best_bic_score,
|
||||
best_params_aic, best_aic_score,
|
||||
best_params_silhouette, best_silhouette_score)
|
||||
|
||||
# Return best results based on BIC (primary), AIC (secondary), Silhouette (tertiary)
|
||||
results = {
|
||||
'bic': (best_labels_bic, best_params_bic, best_bic_score),
|
||||
'aic': (best_labels_aic, best_params_aic, best_aic_score),
|
||||
'silhouette': (best_labels_silhouette, best_params_silhouette, best_silhouette_score)
|
||||
}
|
||||
|
||||
# Print best results
|
||||
if best_labels_bic is not None:
|
||||
print(f"\nBest GMM result by BIC:")
|
||||
print(f"Parameters: {best_params_bic}")
|
||||
print(f"BIC score: {best_bic_score:.2f}")
|
||||
|
||||
if best_labels_aic is not None:
|
||||
print(f"\nBest GMM result by AIC:")
|
||||
print(f"Parameters: {best_params_aic}")
|
||||
print(f"AIC score: {best_aic_score:.2f}")
|
||||
|
||||
if best_labels_silhouette is not None:
|
||||
print(f"\nBest GMM result by Silhouette:")
|
||||
print(f"Parameters: {best_params_silhouette}")
|
||||
print(f"Silhouette score: {best_silhouette_score:.4f}")
|
||||
|
||||
return results
|
||||
|
||||
def save_gmm_grid_search_results(self, all_results,
                                 best_params_bic, best_bic_score,
                                 best_params_aic, best_aic_score,
                                 best_params_silhouette, best_silhouette_score):
    """Save detailed GMM grid search results to a JSON file and a CSV summary.

    Args:
        all_results: List of per-trial dicts. Each dict must provide the
            parameter keys ``n_components``, ``covariance_type``,
            ``reg_covar``, ``n_init``, ``init_params``, ``max_iter`` and the
            metric keys ``bic_score``, ``aic_score``, ``log_likelihood``,
            ``silhouette_score``, ``calinski_harabasz_score``,
            ``davies_bouldin_score``, ``converged``, ``n_iter``,
            ``unique_clusters``.
        best_params_bic, best_params_aic, best_params_silhouette: Parameter
            sets stored verbatim under ``best_results``.
        best_bic_score, best_aic_score: Best scores; ``float('inf')`` is
            treated as "nothing found" and stored as ``null``.
        best_silhouette_score: Best silhouette; values ``<= -1`` are treated
            as "nothing found" and stored as ``null``.

    Side effects:
        Writes ``gmm_grid_search_detailed_<timestamp>.json`` in the current
        working directory, then delegates to ``self.save_grid_search_csv`` to
        write ``gmm_grid_search_summary_<timestamp>.csv``. Prints both names.
    """
    from statistics import median

    param_keys = ('n_components', 'covariance_type', 'reg_covar',
                  'n_init', 'init_params', 'max_iter')
    metric_keys = ('bic_score', 'aic_score', 'log_likelihood',
                   'silhouette_score', 'calinski_harabasz_score',
                   'davies_bouldin_score', 'converged', 'n_iter',
                   'unique_clusters')
    ranking_param_keys = ('n_components', 'covariance_type', 'init_params')

    def _jsonable(value):
        """Convert NumPy scalars/arrays, which ``json`` cannot encode itself."""
        if hasattr(value, "tolist"):
            return value.tolist()
        raise TypeError(
            f"Object of type {type(value).__name__} is not JSON serializable")

    def _summarize(values, higher_is_better, include_median=True):
        """Return best/worst/mean (and optionally median) of ``values``."""
        lowest, highest = min(values), max(values)
        stats = {
            "best": highest if higher_is_better else lowest,
            "worst": lowest if higher_is_better else highest,
            "mean": sum(values) / len(values),
        }
        if include_median:
            # True median: averages the two middle values for even counts.
            stats["median"] = median(values)
        return stats

    def _ranked(results, score_keys):
        """Return the first ten ``results`` as ranked report entries."""
        return [
            {
                "rank": rank,
                "parameters": {key: result[key] for key in ranking_param_keys},
                **{key: result[key] for key in score_keys},
            }
            for rank, result in enumerate(results[:10], start=1)
        ]

    now = datetime.datetime.now()

    # Prepare comprehensive results data
    grid_search_data = {
        "experiment_info": {
            "timestamp": now.isoformat(),
            "dataset_path": self.embeddings_path,
            "total_samples": len(self.file_paths),
            "embedding_dimension": self.embeddings.shape[1],
            "total_combinations_tested": len(all_results),
            "method": "Gaussian Mixture Model"
        },
        "best_results": {
            "by_bic": {
                "parameters": best_params_bic,
                "bic_score": best_bic_score if best_bic_score != float('inf') else None
            },
            "by_aic": {
                "parameters": best_params_aic,
                "aic_score": best_aic_score if best_aic_score != float('inf') else None
            },
            "by_silhouette": {
                "parameters": best_params_silhouette,
                "silhouette_score": best_silhouette_score if best_silhouette_score > -1 else None
            }
        },
        "all_trials": [
            {
                "trial_id": trial_id,
                "parameters": {key: result[key] for key in param_keys},
                "results": {key: result[key] for key in metric_keys},
            }
            for trial_id, result in enumerate(all_results, start=1)
        ],
    }

    # A silhouette of -1 or below marks a trial without a valid clustering.
    valid_results = [r for r in all_results if r['silhouette_score'] > -1]

    # Summary statistics are only meaningful when at least one trial ran.
    if all_results:
        bic_scores = [r['bic_score'] for r in all_results]
        aic_scores = [r['aic_score'] for r in all_results]
        log_likelihoods = [r['log_likelihood'] for r in all_results]
        valid_silhouette = [r['silhouette_score'] for r in valid_results]

        summary = {
            "total_trials": len(all_results),
            "valid_clustering_trials": len(valid_silhouette),
            "bic_score": _summarize(bic_scores, higher_is_better=False),
            "aic_score": _summarize(aic_scores, higher_is_better=False),
            "log_likelihood": _summarize(log_likelihoods, higher_is_better=True,
                                         include_median=False),
        }
        if valid_silhouette:
            summary["silhouette_score"] = _summarize(valid_silhouette,
                                                     higher_is_better=True)
        grid_search_data["summary_statistics"] = summary

    # Top 10 results by different criteria (BIC/AIC: lower is better).
    grid_search_data["top_10_results"] = {
        "by_bic": _ranked(sorted(all_results, key=lambda r: r['bic_score']),
                          ('bic_score', 'aic_score')),
        "by_aic": _ranked(sorted(all_results, key=lambda r: r['aic_score']),
                          ('bic_score', 'aic_score')),
        "by_silhouette": _ranked(
            sorted(valid_results, key=lambda r: r['silhouette_score'], reverse=True),
            ('silhouette_score',)),
    }

    # Save to file with timestamp. The write used to be commented out while
    # the message below still claimed success; it is restored here with a
    # ``default`` hook so NumPy scalar metrics do not abort serialization.
    timestamp = now.strftime("%Y%m%d_%H%M%S")
    filename = f"gmm_grid_search_detailed_{timestamp}.json"
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(grid_search_data, f, indent=4, ensure_ascii=False,
                  default=_jsonable)

    print(f"Detailed grid search results saved to: {filename}")

    # Also save a CSV summary for easy analysis
    csv_filename = f"gmm_grid_search_summary_{timestamp}.csv"
    self.save_grid_search_csv(all_results, csv_filename)
    print(f"Grid search summary CSV saved to: {csv_filename}")
|
||||
|
||||
def save_grid_search_csv(self, all_results, filename):
    """Write the grid search trials to ``filename`` as a CSV table.

    The file gets a header row followed by one row per entry of
    ``all_results``. ``trial_id`` is the 1-based position of the trial in
    the list; every other column is copied from the trial dict under the
    same key, so a missing key raises ``KeyError``.
    """
    # Columns taken straight from each trial dict, in output order.
    copied_columns = [
        'n_components', 'covariance_type', 'reg_covar',
        'n_init', 'init_params', 'max_iter',
        'bic_score', 'aic_score', 'log_likelihood',
        'silhouette_score', 'calinski_harabasz_score',
        'davies_bouldin_score', 'converged', 'n_iter', 'unique_clusters',
    ]

    with open(filename, 'w', newline='', encoding='utf-8') as out:
        table = csv.DictWriter(out, fieldnames=['trial_id'] + copied_columns)
        table.writeheader()

        for trial_number, trial in enumerate(all_results, start=1):
            record = {'trial_id': trial_number}
            for column in copied_columns:
                record[column] = trial[column]
            table.writerow(record)
|
||||
|
||||
def visualize_results(self, results):
    """Plot the three selected clusterings side by side on a PCA projection.

    ``results`` maps ``'bic'``, ``'aic'`` and ``'silhouette'`` to a
    ``(labels, params, score)`` tuple. Each gets one panel: a scatter of the
    2-D PCA projection of ``self.embeddings_scaled`` coloured by cluster, or
    a "No valid clustering" placeholder when ``labels`` is ``None``. The
    figure is written to ``gmm_clustering_results.png`` and then shown.
    """
    figure, panels = plt.subplots(1, 3, figsize=(18, 6))

    # Project the scaled embeddings to two dimensions for plotting.
    projector = PCA(n_components=2, random_state=42)
    points = projector.fit_transform(self.embeddings_scaled)

    panel_specs = (
        ('bic', 'Best by BIC'),
        ('aic', 'Best by AIC'),
        ('silhouette', 'Best by Silhouette'),
    )

    for panel, (criterion, heading) in zip(panels, panel_specs):
        labels, params, _score = results[criterion]

        if labels is None:
            # Nothing to draw for this criterion; show a placeholder instead.
            panel.text(0.5, 0.5, 'No valid clustering', ha='center', va='center',
                       transform=panel.transAxes, fontsize=12)
            panel.set_title(f'{heading}\n(Failed)')
        else:
            cluster_ids = set(labels)
            palette = plt.cm.Set3(np.linspace(0, 1, len(cluster_ids)))

            for cluster_id, shade in zip(cluster_ids, palette):
                members = labels == cluster_id
                panel.scatter(points[members, 0], points[members, 1],
                              c=[shade], s=50, alpha=0.7,
                              label=f'Cluster {cluster_id}')

            n_components = params["n_components"]
            covariance = params["covariance_type"]
            panel.set_title(f'{heading}\nn_components={n_components}, '
                            f'cov={covariance}')

        panel.set_xlabel('PCA Component 1')
        panel.set_ylabel('PCA Component 2')
        panel.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig('gmm_clustering_results.png', dpi=300, bbox_inches='tight')
    plt.show()

    print(f"Visualization saved as 'gmm_clustering_results.png'")
|
||||
|
||||
def save_clustering_results(self, results):
    """Save final clustering results to JSON files.

    Args:
        results: Mapping with the keys ``'bic'``, ``'aic'`` and
            ``'silhouette'``, each holding a ``(labels, params, score)``
            tuple. Entries whose ``labels`` is ``None`` are skipped.
            ``params`` must contain ``'n_components'``.

    Side effects:
        For every entry with labels, writes
        ``gmm_final_results_<method>_<timestamp>.json`` in the current
        working directory, pairing each path in ``self.file_paths`` with its
        cluster label, and prints the file name.
    """
    def _jsonable(value):
        """Convert NumPy scalars/arrays, which ``json`` cannot encode itself."""
        if hasattr(value, "tolist"):
            return value.tolist()
        raise TypeError(
            f"Object of type {type(value).__name__} is not JSON serializable")

    # One timestamp for the whole call so the files of a run share a suffix.
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

    for method in ('bic', 'aic', 'silhouette'):
        labels, params, score = results[method]
        if labels is None:
            continue

        clustering_results = [
            {"filepath": filepath, "cluster": int(label)}
            for filepath, label in zip(self.file_paths, labels)
        ]
        payload = {
            "method": f"GMM (best by {method.upper()})",
            "parameters": params,
            "n_components": params['n_components'],
            "n_samples": len(labels),
            f"{method}_score": score,
            "results": clustering_results
        }

        filename = f"gmm_final_results_{method}_{timestamp}.json"
        with open(filename, 'w', encoding='utf-8') as f:
            # ``default`` keeps NumPy scalars in ``score``/``params`` from
            # raising TypeError and leaving a truncated file behind.
            json.dump(payload, f, indent=4, default=_jsonable)

        print(f"Final clustering results ({method}) saved to: {filename}")
|
||||
|
||||
def main():
    """Command-line entry point for the extensive GMM clustering run.

    Reads the required ``--embeddings_path`` argument, runs the grid search
    on a ``GMMExtensiveClustering`` instance and, when at least one
    selection criterion produced labels, visualizes and saves the results.
    """
    cli = argparse.ArgumentParser(description="Run extensive Gaussian Mixture Model clustering on document embeddings")
    cli.add_argument("--embeddings_path", required=True, help="Path to embeddings JSON file")
    options = cli.parse_args()

    # Initialize clustering and run the extensive grid search.
    clusterer = GMMExtensiveClustering(options.embeddings_path)
    results = clusterer.run_gmm_grid_search()

    found_any = any(labels is not None
                    for labels, _params, _score in results.values())
    if not found_any:
        print("\nGMM extensive clustering did not find suitable clusters.")
        return

    # Visualize and save results
    clusterer.visualize_results(results)
    clusterer.save_clustering_results(results)
    print("\nGMM extensive clustering completed successfully!")


if __name__ == "__main__":
    main()
|
414
cluster/gmm_grid_search_summary_20250801_015245.csv
Normal file
414
cluster/gmm_grid_search_summary_20250801_015245.csv
Normal file
@@ -0,0 +1,414 @@
|
||||
trial_id,n_components,covariance_type,reg_covar,n_init,init_params,max_iter,bic_score,aic_score,log_likelihood,silhouette_score,calinski_harabasz_score,davies_bouldin_score,converged,n_iter,unique_clusters
|
||||
1,2,full,0.0001,1,kmeans,100,17260132.605124418,-7679507.0,2871.501,0.36924269795417786,1331.6861572265625,1.080913887973297,True,2,2
|
||||
2,2,full,0.0001,1,kmeans,300,17260132.605124418,-7679507.0,2871.501,0.36924269795417786,1331.6861572265625,1.080913887973297,True,2,2
|
||||
3,2,full,0.0001,5,kmeans,100,17260132.605124418,-7679507.0,2871.501,0.36924269795417786,1331.6861572265625,1.080913887973297,True,2,2
|
||||
4,2,full,0.0001,5,kmeans,300,17260132.605124418,-7679507.0,2871.501,0.36924269795417786,1331.6861572265625,1.080913887973297,True,2,2
|
||||
5,2,full,0.001,1,kmeans,100,20844797.605124418,-4094842.0,2231.382,0.36924269795417786,1331.6861572265625,1.080913887973297,True,2,2
|
||||
6,2,full,0.001,1,kmeans,300,20844797.605124418,-4094842.0,2231.382,0.36924269795417786,1331.6861572265625,1.080913887973297,True,2,2
|
||||
7,2,full,0.001,5,kmeans,100,20844797.605124418,-4094842.0,2231.382,0.36924269795417786,1331.6861572265625,1.080913887973297,True,2,2
|
||||
8,2,full,0.001,5,kmeans,300,20844797.605124418,-4094842.0,2231.382,0.36924269795417786,1331.6861572265625,1.080913887973297,True,2,2
|
||||
9,2,full,0.0001,1,k-means++,100,17370120.605124418,-7569519.0,2851.86,0.3683019280433655,1320.3240966796875,1.0772816604479254,True,3,2
|
||||
10,2,full,0.0001,1,k-means++,300,17370120.605124418,-7569519.0,2851.86,0.3683019280433655,1320.3240966796875,1.0772816604479254,True,3,2
|
||||
11,2,full,0.0001,5,k-means++,100,17270534.605124418,-7669105.0,2869.6433,0.3693194091320038,1331.4493408203125,1.0799693510874797,True,3,2
|
||||
12,2,full,0.0001,5,k-means++,300,17270534.605124418,-7669105.0,2869.6433,0.3693194091320038,1331.4493408203125,1.0799693510874797,True,3,2
|
||||
13,2,full,0.001,1,k-means++,100,20919727.605124418,-4019912.0,2218.0017,0.3683019280433655,1320.3240966796875,1.0772816604479254,True,3,2
|
||||
14,2,full,0.001,1,k-means++,300,20919727.605124418,-4019912.0,2218.0017,0.3683019280433655,1320.3240966796875,1.0772816604479254,True,3,2
|
||||
15,2,full,0.001,5,k-means++,100,20851959.605124418,-4087680.0,2230.1033,0.3693194091320038,1331.4493408203125,1.0799693510874797,True,3,2
|
||||
16,2,full,0.001,5,k-means++,300,20851959.605124418,-4087680.0,2230.1033,0.3693194091320038,1331.4493408203125,1.0799693510874797,True,3,2
|
||||
17,3,full,0.0001,1,kmeans,100,33833558.37637398,-3575904.0,2888.795,0.37564584612846375,781.5426635742188,0.7905502894819209,True,2,3
|
||||
18,3,full,0.0001,1,kmeans,300,33833558.37637398,-3575904.0,2888.795,0.37564584612846375,781.5426635742188,0.7905502894819209,True,2,3
|
||||
19,3,full,0.0001,5,kmeans,100,26462676.376373976,-10946786.0,4205.024,0.2200196236371994,918.8184814453125,1.8325651201939497,True,2,3
|
||||
20,3,full,0.0001,5,kmeans,300,26462676.376373976,-10946786.0,4205.024,0.2200196236371994,918.8184814453125,1.8325651201939497,True,2,3
|
||||
21,3,full,0.001,1,kmeans,100,37452100.37637398,42638.0,2242.6267,0.37564584612846375,781.5426635742188,0.7905502894819209,True,2,3
|
||||
22,3,full,0.001,1,kmeans,300,37452100.37637398,42638.0,2242.6267,0.37564584612846375,781.5426635742188,0.7905502894819209,True,2,3
|
||||
23,3,full,0.001,5,kmeans,100,33411843.376373976,-3997619.0,2964.1013,0.2200196236371994,918.8184814453125,1.8325651201939497,True,2,3
|
||||
24,3,full,0.001,5,kmeans,300,33411843.376373976,-3997619.0,2964.1013,0.2200196236371994,918.8184814453125,1.8325651201939497,True,2,3
|
||||
25,3,full,0.001,1,k-means++,100,37089716.37637398,-319746.0,2307.3381,0.3671606779098511,710.0538940429688,1.7711288790751016,True,3,3
|
||||
26,3,full,0.001,1,k-means++,300,37089716.37637398,-319746.0,2307.3381,0.3671606779098511,710.0538940429688,1.7711288790751016,True,3,3
|
||||
27,3,full,0.001,5,k-means++,100,33366158.376373976,-4043304.0,2972.2593,0.19688381254673004,834.6746215820312,2.265938022603405,True,3,3
|
||||
28,3,full,0.001,5,k-means++,300,33366158.376373976,-4043304.0,2972.2593,0.19688381254673004,834.6746215820312,2.265938022603405,True,3,3
|
||||
29,4,full,0.0001,1,kmeans,100,44240183.14762353,-5639102.0,4007.3035,0.310958594083786,829.0513916015625,1.0906616124036408,True,2,4
|
||||
30,4,full,0.0001,1,kmeans,300,44240183.14762353,-5639102.0,4007.3035,0.310958594083786,829.0513916015625,1.0906616124036408,True,2,4
|
||||
31,4,full,0.0001,5,kmeans,100,38759701.14762353,-11119584.0,4985.961,0.2639731168746948,873.2352905273438,1.6723937598752374,True,2,4
|
||||
32,4,full,0.0001,5,kmeans,300,38759701.14762353,-11119584.0,4985.961,0.2639731168746948,873.2352905273438,1.6723937598752374,True,2,4
|
||||
33,4,full,0.001,1,kmeans,100,50532988.14762353,653703.0,2883.5884,0.310958594083786,829.0513916015625,1.0906616124036408,True,2,4
|
||||
34,4,full,0.001,1,kmeans,300,50532988.14762353,653703.0,2883.5884,0.310958594083786,829.0513916015625,1.0906616124036408,True,2,4
|
||||
35,4,full,0.001,5,kmeans,100,47456739.14762353,-2422546.0,3432.9185,0.2639731168746948,873.2352905273438,1.6723937598752374,True,2,4
|
||||
36,4,full,0.001,5,kmeans,300,47456739.14762353,-2422546.0,3432.9185,0.2639731168746948,873.2352905273438,1.6723937598752374,True,2,4
|
||||
37,4,full,0.001,1,k-means++,100,50369343.14762353,490058.0,2912.8108,0.16492997109889984,579.3992309570312,2.14603204385876,True,3,4
|
||||
38,4,full,0.001,1,k-means++,300,50369343.14762353,490058.0,2912.8108,0.16492997109889984,579.3992309570312,2.14603204385876,True,3,4
|
||||
39,4,full,0.001,5,k-means++,100,48104059.14762353,-1775226.0,3317.3257,0.19116489589214325,729.421630859375,2.391271825318095,True,3,4
|
||||
40,4,full,0.001,5,k-means++,300,48104059.14762353,-1775226.0,3317.3257,0.19116489589214325,729.421630859375,2.391271825318095,True,3,4
|
||||
41,5,full,0.0001,1,kmeans,100,60034171.91887309,-2314936.0,4163.7827,0.3162730038166046,780.908935546875,1.0143329238161003,True,2,5
|
||||
42,5,full,0.0001,1,kmeans,300,60034171.91887309,-2314936.0,4163.7827,0.3162730038166046,780.908935546875,1.0143329238161003,True,2,5
|
||||
43,5,full,0.0001,5,kmeans,100,54230057.91887309,-8119050.0,5200.232,0.25592249631881714,721.9691162109375,1.585169064053077,True,2,5
|
||||
44,5,full,0.0001,5,kmeans,300,54230057.91887309,-8119050.0,5200.232,0.25592249631881714,721.9691162109375,1.585169064053077,True,2,5
|
||||
45,5,full,0.001,1,kmeans,100,66698922.91887309,4349815.0,2973.6487,0.3162730038166046,780.908935546875,1.0143329238161003,True,2,5
|
||||
46,5,full,0.001,1,kmeans,300,66698922.91887309,4349815.0,2973.6487,0.3162730038166046,780.908935546875,1.0143329238161003,True,2,5
|
||||
47,5,full,0.001,5,kmeans,100,63375271.91887309,1026164.0,3567.158,0.25592249631881714,721.9691162109375,1.585169064053077,True,2,5
|
||||
48,5,full,0.001,5,kmeans,300,63375271.91887309,1026164.0,3567.158,0.25592249631881714,721.9691162109375,1.585169064053077,True,2,5
|
||||
49,5,full,0.001,1,k-means++,100,66517075.91887309,4167968.0,3006.1213,0.16880138218402863,549.265625,1.8881643569801063,True,3,5
|
||||
50,5,full,0.001,1,k-means++,300,66517075.91887309,4167968.0,3006.1213,0.16880138218402863,549.265625,1.8881643569801063,True,3,5
|
||||
51,5,full,0.001,5,k-means++,100,63364071.91887309,1014964.0,3569.158,0.25286975502967834,715.984619140625,1.6093410042807197,True,3,5
|
||||
52,5,full,0.001,5,k-means++,300,63364071.91887309,1014964.0,3569.158,0.25286975502967834,715.984619140625,1.6093410042807197,True,3,5
|
||||
53,6,full,0.0001,1,kmeans,100,73062550.69012265,-1756380.0,4814.121,0.24853873252868652,668.8661499023438,1.646429379523011,True,2,6
|
||||
54,6,full,0.0001,1,kmeans,300,73062550.69012265,-1756380.0,4814.121,0.24853873252868652,668.8661499023438,1.646429379523011,True,2,6
|
||||
55,6,full,0.0001,5,kmeans,100,69890932.69012265,-4927998.0,5380.4814,0.27074411511421204,655.6273193359375,1.6294192539951549,True,2,6
|
||||
56,6,full,0.0001,5,kmeans,300,69890932.69012265,-4927998.0,5380.4814,0.27074411511421204,655.6273193359375,1.6294192539951549,True,2,6
|
||||
57,6,full,0.001,1,kmeans,100,81179056.69012265,6360126.0,3364.745,0.24853873252868652,668.8661499023438,1.646429379523011,True,2,6
|
||||
58,6,full,0.001,1,kmeans,300,81179056.69012265,6360126.0,3364.745,0.24853873252868652,668.8661499023438,1.646429379523011,True,2,6
|
||||
59,6,full,0.001,5,kmeans,100,79356620.69012265,4537690.0,3690.18,0.27074411511421204,655.6273193359375,1.6294192539951549,True,2,6
|
||||
60,6,full,0.001,5,kmeans,300,79356620.69012265,4537690.0,3690.18,0.27074411511421204,655.6273193359375,1.6294192539951549,True,2,6
|
||||
61,6,full,1e-05,1,k-means++,100,68753328.69012265,-6065602.0,5583.625,0.17110876739025116,480.8229675292969,1.5572656008570327,True,3,6
|
||||
62,6,full,1e-05,1,k-means++,300,68753328.69012265,-6065602.0,5583.625,0.17110876739025116,480.8229675292969,1.5572656008570327,True,3,6
|
||||
63,6,full,0.0001,1,k-means++,100,75948732.69012265,1129802.0,4298.7314,0.17110876739025116,480.8229675292969,1.5572656008570327,True,3,6
|
||||
64,6,full,0.0001,1,k-means++,300,75948732.69012265,1129802.0,4298.7314,0.17110876739025116,480.8229675292969,1.5572656008570327,True,3,6
|
||||
65,6,full,0.0001,5,k-means++,100,69381502.69012265,-5437428.0,5471.451,0.21281521022319794,580.2518920898438,2.180165862436555,True,3,6
|
||||
66,6,full,0.0001,5,k-means++,300,69381502.69012265,-5437428.0,5471.451,0.21281521022319794,580.2518920898438,2.180165862436555,True,3,6
|
||||
67,6,full,0.001,1,k-means++,100,83185656.69012265,8366726.0,3006.4236,0.17110876739025116,480.8229675292969,1.5572656008570327,True,3,6
|
||||
68,6,full,0.001,1,k-means++,300,83185656.69012265,8366726.0,3006.4236,0.17110876739025116,480.8229675292969,1.5572656008570327,True,3,6
|
||||
69,6,full,0.001,5,k-means++,100,79079084.69012265,4260154.0,3739.74,0.21281521022319794,580.2518920898438,2.180165862436555,True,3,6
|
||||
70,6,full,0.001,5,k-means++,300,79079084.69012265,4260154.0,3739.74,0.21281521022319794,580.2518920898438,2.180165862436555,True,3,6
|
||||
71,8,full,0.0001,1,kmeans,100,101322900.23262176,1564326.0,5721.299,0.2680038809776306,557.3214721679688,1.5652706740038278,True,2,8
|
||||
72,8,full,0.0001,1,kmeans,300,101322900.23262176,1564326.0,5721.299,0.2680038809776306,557.3214721679688,1.5652706740038278,True,2,8
|
||||
73,8,full,0.0001,5,kmeans,100,100810002.23262176,1051428.0,5812.8877,0.27481919527053833,642.9092407226562,1.6967101819134367,True,2,8
|
||||
74,8,full,0.0001,5,kmeans,300,100810002.23262176,1051428.0,5812.8877,0.27481919527053833,642.9092407226562,1.6967101819134367,True,2,8
|
||||
75,8,full,0.001,1,kmeans,100,111448618.23262176,11690044.0,3913.135,0.2680038809776306,557.3214721679688,1.5652706740038278,True,2,8
|
||||
76,8,full,0.001,1,kmeans,300,111448618.23262176,11690044.0,3913.135,0.2680038809776306,557.3214721679688,1.5652706740038278,True,2,8
|
||||
77,8,full,0.001,5,kmeans,100,111172686.23262176,11414112.0,3962.4087,0.27481919527053833,642.9092407226562,1.6967101819134367,True,2,8
|
||||
78,8,full,0.001,5,kmeans,300,111172686.23262176,11414112.0,3962.4087,0.27481919527053833,642.9092407226562,1.6967101819134367,True,2,8
|
||||
79,8,full,0.001,1,k-means++,100,111979964.23262176,12221390.0,3818.2517,0.2020130306482315,465.200927734375,1.9463124697846808,True,3,8
|
||||
80,8,full,0.001,1,k-means++,300,111979964.23262176,12221390.0,3818.2517,0.2020130306482315,465.200927734375,1.9463124697846808,True,3,8
|
||||
81,8,full,0.001,5,k-means++,100,111327662.23262176,11569088.0,3934.7344,0.2736768126487732,617.4371948242188,1.7398856934277325,True,3,8
|
||||
82,8,full,0.001,5,k-means++,300,111327662.23262176,11569088.0,3934.7344,0.2736768126487732,617.4371948242188,1.7398856934277325,True,3,8
|
||||
83,10,full,0.0001,1,kmeans,100,133265705.77512088,8567482.0,5970.8955,0.24388161301612854,576.40185546875,1.5385559411472558,True,2,10
|
||||
84,10,full,0.0001,1,kmeans,300,133265705.77512088,8567482.0,5970.8955,0.24388161301612854,576.40185546875,1.5385559411472558,True,2,10
|
||||
85,10,full,0.0001,5,kmeans,100,132892239.77512088,8194016.0,6037.586,0.28627628087997437,557.144775390625,1.6716653781194553,True,2,10
|
||||
86,10,full,0.0001,5,kmeans,300,132892239.77512088,8194016.0,6037.586,0.28627628087997437,557.144775390625,1.6716653781194553,True,2,10
|
||||
87,10,full,0.001,1,kmeans,100,143970687.77512088,19272464.0,4059.2915,0.24388161301612854,576.40185546875,1.5385559411472558,True,2,10
|
||||
88,10,full,0.001,1,kmeans,300,143970687.77512088,19272464.0,4059.2915,0.24388161301612854,576.40185546875,1.5385559411472558,True,2,10
|
||||
89,10,full,0.001,5,kmeans,100,143652495.77512088,18954272.0,4116.1113,0.28627628087997437,557.144775390625,1.6716653781194553,True,2,10
|
||||
90,10,full,0.001,5,kmeans,300,143652495.77512088,18954272.0,4116.1113,0.28627628087997437,557.144775390625,1.6716653781194553,True,2,10
|
||||
91,10,full,0.001,1,k-means++,100,144482919.77512088,19784696.0,3967.8215,0.17508849501609802,474.4588928222656,1.846488092509191,True,3,10
|
||||
92,10,full,0.001,1,k-means++,300,144482919.77512088,19784696.0,3967.8215,0.17508849501609802,474.4588928222656,1.846488092509191,True,3,10
|
||||
93,10,full,0.001,5,k-means++,100,144071547.77512088,19373324.0,4041.2808,0.22849640250205994,521.3035278320312,1.9523215129883376,True,3,10
|
||||
94,10,full,0.001,5,k-means++,300,144071547.77512088,19373324.0,4041.2808,0.22849640250205994,521.3035278320312,1.9523215129883376,True,3,10
|
||||
95,11,full,0.0001,1,kmeans,100,149128048.54637042,11960004.0,6115.1685,0.2577499449253082,598.6676635742188,1.5193188313170118,True,2,11
|
||||
96,11,full,0.0001,1,kmeans,300,149128048.54637042,11960004.0,6115.1685,0.2577499449253082,598.6676635742188,1.5193188313170118,True,2,11
|
||||
97,11,full,0.0001,5,kmeans,100,149128048.54637042,11960004.0,6115.1685,0.2577499449253082,598.6676635742188,1.5193188313170118,True,2,11
|
||||
98,11,full,0.0001,5,kmeans,300,149128048.54637042,11960004.0,6115.1685,0.2577499449253082,598.6676635742188,1.5193188313170118,True,2,11
|
||||
99,11,full,0.001,1,kmeans,100,160074600.54637042,22906556.0,4160.4272,0.2577499449253082,598.6676635742188,1.5193188313170118,True,2,11
|
||||
100,11,full,0.001,1,kmeans,300,160074600.54637042,22906556.0,4160.4272,0.2577499449253082,598.6676635742188,1.5193188313170118,True,2,11
|
||||
101,11,full,0.001,5,kmeans,100,160074600.54637042,22906556.0,4160.4272,0.2577499449253082,598.6676635742188,1.5193188313170118,True,2,11
|
||||
102,11,full,0.001,5,kmeans,300,160074600.54637042,22906556.0,4160.4272,0.2577499449253082,598.6676635742188,1.5193188313170118,True,2,11
|
||||
103,11,full,0.001,1,k-means++,100,160636770.54637042,23468726.0,4060.0396,0.18649740517139435,485.63348388671875,1.8009971426865101,True,3,11
|
||||
104,11,full,0.001,1,k-means++,300,160636770.54637042,23468726.0,4060.0396,0.18649740517139435,485.63348388671875,1.8009971426865101,True,3,11
|
||||
105,11,full,0.001,5,k-means++,100,160636770.54637042,23468726.0,4060.0396,0.18649740517139435,485.63348388671875,1.8009971426865101,True,3,11
|
||||
106,11,full,0.001,5,k-means++,300,160636770.54637042,23468726.0,4060.0396,0.18649740517139435,485.63348388671875,1.8009971426865101,True,3,11
|
||||
107,14,full,0.0001,1,kmeans,100,198149922.8601191,23572408.0,6291.7656,0.21188320219516754,491.18792724609375,1.7082811638393387,True,2,14
|
||||
108,14,full,0.0001,1,kmeans,300,198149922.8601191,23572408.0,6291.7656,0.21188320219516754,491.18792724609375,1.7082811638393387,True,2,14
|
||||
109,14,full,0.0001,5,kmeans,100,197540314.8601191,22962800.0,6400.6245,0.20964229106903076,496.4472351074219,1.950038464238459,True,2,14
|
||||
110,14,full,0.0001,5,kmeans,300,197540314.8601191,22962800.0,6400.6245,0.20964229106903076,496.4472351074219,1.950038464238459,True,2,14
|
||||
111,14,full,0.001,1,kmeans,100,209401674.8601191,34824160.0,4282.5244,0.21188320219516754,491.18792724609375,1.7082811638393387,True,2,14
|
||||
112,14,full,0.001,1,kmeans,300,209401674.8601191,34824160.0,4282.5244,0.21188320219516754,491.18792724609375,1.7082811638393387,True,2,14
|
||||
113,14,full,0.001,5,kmeans,100,208994740.8601191,34417224.0,4355.191,0.20964229106903076,496.4472351074219,1.950038464238459,True,2,14
|
||||
114,14,full,0.001,5,kmeans,300,208994740.8601191,34417224.0,4355.191,0.20964229106903076,496.4472351074219,1.950038464238459,True,2,14
|
||||
115,14,full,0.0001,1,k-means++,100,197987434.8601191,23409920.0,6320.7812,0.16694776713848114,449.8548889160156,1.8331922544784534,True,3,14
|
||||
116,14,full,0.0001,1,k-means++,300,197987434.8601191,23409920.0,6320.7812,0.16694776713848114,449.8548889160156,1.8331922544784534,True,3,14
|
||||
117,14,full,0.0001,5,k-means++,100,197987434.8601191,23409920.0,6320.7812,0.16694776713848114,449.8548889160156,1.8331922544784534,True,3,14
|
||||
118,14,full,0.0001,5,k-means++,300,197987434.8601191,23409920.0,6320.7812,0.16694776713848114,449.8548889160156,1.8331922544784534,True,3,14
|
||||
119,14,full,0.001,1,k-means++,100,209335602.8601191,34758090.0,4294.3228,0.16694776713848114,449.8548889160156,1.8331922544784534,True,3,14
|
||||
120,14,full,0.001,1,k-means++,300,209335602.8601191,34758090.0,4294.3228,0.16694776713848114,449.8548889160156,1.8331922544784534,True,3,14
|
||||
121,14,full,0.001,5,k-means++,100,209335602.8601191,34758090.0,4294.3228,0.16694776713848114,449.8548889160156,1.8331922544784534,True,3,14
|
||||
122,14,full,0.001,5,k-means++,300,209335602.8601191,34758090.0,4294.3228,0.16694776713848114,449.8548889160156,1.8331922544784534,True,3,14
|
||||
123,17,full,0.0001,1,kmeans,100,247173509.17386776,35186530.0,6468.057,0.1834743171930313,427.35272216796875,1.8956740098304399,True,2,17
|
||||
124,17,full,0.0001,1,kmeans,300,247173509.17386776,35186530.0,6468.057,0.1834743171930313,427.35272216796875,1.8956740098304399,True,2,17
|
||||
125,17,full,0.0001,5,kmeans,100,246850361.17386776,34863380.0,6525.762,0.2085043042898178,427.7763977050781,1.9588585142518828,True,2,17
|
||||
126,17,full,0.0001,5,kmeans,300,246850361.17386776,34863380.0,6525.762,0.2085043042898178,427.7763977050781,1.9588585142518828,True,2,17
|
||||
127,17,full,0.001,1,kmeans,100,258736973.17386776,46749990.0,4403.153,0.1834743171930313,427.35272216796875,1.8956740098304399,True,2,17
|
||||
128,17,full,0.001,1,kmeans,300,258736973.17386776,46749990.0,4403.153,0.1834743171930313,427.35272216796875,1.8956740098304399,True,2,17
|
||||
129,17,full,0.001,5,kmeans,100,258504801.17386776,46517820.0,4444.6123,0.2085043042898178,427.7763977050781,1.9588585142518828,True,2,17
|
||||
130,17,full,0.001,5,kmeans,300,258504801.17386776,46517820.0,4444.6123,0.2085043042898178,427.7763977050781,1.9588585142518828,True,2,17
|
||||
131,17,full,0.0001,1,k-means++,100,247607397.17386776,35620416.0,6390.577,0.14455115795135498,384.99053955078125,2.108500185002096,True,3,17
|
||||
132,17,full,0.0001,1,k-means++,300,247607397.17386776,35620416.0,6390.577,0.14455115795135498,384.99053955078125,2.108500185002096,True,3,17
|
||||
133,17,full,0.0001,5,k-means++,100,246784997.17386776,34798016.0,6537.434,0.13458234071731567,386.3608093261719,2.3614049227531075,True,3,17
|
||||
134,17,full,0.0001,5,k-means++,300,246784997.17386776,34798016.0,6537.434,0.13458234071731567,386.3608093261719,2.3614049227531075,True,3,17
|
||||
135,17,full,0.001,1,k-means++,100,259055585.17386776,47068604.0,4346.258,0.14455115795135498,384.99053955078125,2.108500185002096,True,3,17
|
||||
136,17,full,0.001,1,k-means++,300,259055585.17386776,47068604.0,4346.258,0.14455115795135498,384.99053955078125,2.108500185002096,True,3,17
|
||||
137,17,full,0.001,5,k-means++,100,258522869.17386776,46535890.0,4441.3857,0.13458234071731567,386.3608093261719,2.3614049227531075,True,3,17
|
||||
138,17,full,0.001,5,k-means++,300,258522869.17386776,46535890.0,4441.3857,0.13458234071731567,386.3608093261719,2.3614049227531075,True,3,17
|
||||
139,20,full,0.0001,1,kmeans,100,296473639.4876164,47077190.0,6594.966,0.1770476996898651,382.437744140625,1.8608292401058428,True,2,20
|
||||
140,20,full,0.0001,1,kmeans,300,296473639.4876164,47077190.0,6594.966,0.1770476996898651,382.437744140625,1.8608292401058428,True,2,20
|
||||
141,20,full,0.001,1,kmeans,100,308235301.4876164,58838856.0,4494.669,0.1770476996898651,382.437744140625,1.8608292401058428,True,2,20
|
||||
142,20,full,0.001,1,kmeans,300,308235301.4876164,58838856.0,4494.669,0.1770476996898651,382.437744140625,1.8608292401058428,True,2,20
|
||||
143,20,full,0.001,5,kmeans,100,307947927.4876164,58551480.0,4545.986,0.12884767353534698,377.9795227050781,2.0180962938149367,True,2,20
|
||||
144,20,full,0.001,5,kmeans,300,307947927.4876164,58551480.0,4545.986,0.12884767353534698,377.9795227050781,2.0180962938149367,True,2,20
|
||||
145,20,full,0.0001,1,k-means++,100,297139767.4876164,47743320.0,6476.014,0.13996723294258118,336.5575866699219,2.2953358196957456,True,3,20
|
||||
146,20,full,0.0001,1,k-means++,300,297139767.4876164,47743320.0,6476.014,0.13996723294258118,336.5575866699219,2.2953358196957456,True,3,20
|
||||
147,20,full,0.001,1,k-means++,100,308712155.4876164,59315708.0,4409.5166,0.13996723294258118,336.5575866699219,2.2953358196957456,True,3,20
|
||||
148,20,full,0.001,1,k-means++,300,308712155.4876164,59315708.0,4409.5166,0.13996723294258118,336.5575866699219,2.2953358196957456,True,3,20
|
||||
149,20,full,0.001,5,k-means++,100,308599855.4876164,59203410.0,4429.57,0.15204866230487823,341.3536376953125,2.231048217195437,True,3,20
|
||||
150,20,full,0.001,5,k-means++,300,308599855.4876164,59203410.0,4429.57,0.15204866230487823,341.3536376953125,2.231048217195437,True,3,20
|
||||
151,2,diag,1e-05,1,kmeans,100,13089173.910885666,13040529.0,-2325.7397,0.36971479654312134,1327.397216796875,1.073152783729392,True,4,2
|
||||
152,2,diag,1e-05,1,kmeans,300,13089173.910885666,13040529.0,-2325.7397,0.36971479654312134,1327.397216796875,1.073152783729392,True,4,2
|
||||
153,2,diag,1e-05,5,kmeans,100,13089173.910885666,13040529.0,-2325.7397,0.36971479654312134,1327.397216796875,1.073152783729392,True,4,2
|
||||
154,2,diag,1e-05,5,kmeans,300,13089173.910885666,13040529.0,-2325.7397,0.36971479654312134,1327.397216796875,1.073152783729392,True,4,2
|
||||
155,2,diag,0.0001,1,kmeans,100,13089173.910885666,13040529.0,-2325.7397,0.36971479654312134,1327.397216796875,1.073152783729392,True,4,2
|
||||
156,2,diag,0.0001,1,kmeans,300,13089173.910885666,13040529.0,-2325.7397,0.36971479654312134,1327.397216796875,1.073152783729392,True,4,2
|
||||
157,2,diag,0.0001,5,kmeans,100,13089173.910885666,13040529.0,-2325.7397,0.36971479654312134,1327.397216796875,1.073152783729392,True,4,2
|
||||
158,2,diag,0.0001,5,kmeans,300,13089173.910885666,13040529.0,-2325.7397,0.36971479654312134,1327.397216796875,1.073152783729392,True,4,2
|
||||
159,2,diag,0.001,1,kmeans,100,13089203.910885666,13040559.0,-2325.745,0.36971479654312134,1327.397216796875,1.073152783729392,True,4,2
|
||||
160,2,diag,0.001,1,kmeans,300,13089203.910885666,13040559.0,-2325.745,0.36971479654312134,1327.397216796875,1.073152783729392,True,4,2
|
||||
161,2,diag,0.001,5,kmeans,100,13089203.910885666,13040559.0,-2325.745,0.36971479654312134,1327.397216796875,1.073152783729392,True,4,2
|
||||
162,2,diag,0.001,5,kmeans,300,13089203.910885666,13040559.0,-2325.745,0.36971479654312134,1327.397216796875,1.073152783729392,True,4,2
|
||||
163,2,diag,1e-05,1,k-means++,100,13089173.910885666,13040529.0,-2325.7397,0.36971479654312134,1327.397216796875,1.073152783729392,True,6,2
|
||||
164,2,diag,1e-05,1,k-means++,300,13089173.910885666,13040529.0,-2325.7397,0.36971479654312134,1327.397216796875,1.073152783729392,True,6,2
|
||||
165,2,diag,1e-05,5,k-means++,100,13089173.910885666,13040529.0,-2325.7397,0.36971479654312134,1327.397216796875,1.073152783729392,True,6,2
|
||||
166,2,diag,1e-05,5,k-means++,300,13089173.910885666,13040529.0,-2325.7397,0.36971479654312134,1327.397216796875,1.073152783729392,True,6,2
|
||||
167,2,diag,0.0001,1,k-means++,100,13089173.910885666,13040529.0,-2325.7397,0.36971479654312134,1327.397216796875,1.073152783729392,True,5,2
|
||||
168,2,diag,0.0001,1,k-means++,300,13089173.910885666,13040529.0,-2325.7397,0.36971479654312134,1327.397216796875,1.073152783729392,True,5,2
|
||||
169,2,diag,0.0001,5,k-means++,100,13089173.910885666,13040529.0,-2325.7397,0.36971479654312134,1327.397216796875,1.073152783729392,True,5,2
|
||||
170,2,diag,0.0001,5,k-means++,300,13089173.910885666,13040529.0,-2325.7397,0.36971479654312134,1327.397216796875,1.073152783729392,True,5,2
|
||||
171,2,diag,0.001,1,k-means++,100,13089203.910885666,13040559.0,-2325.745,0.36971479654312134,1327.397216796875,1.073152783729392,True,5,2
|
||||
172,2,diag,0.001,1,k-means++,300,13089203.910885666,13040559.0,-2325.745,0.36971479654312134,1327.397216796875,1.073152783729392,True,5,2
|
||||
173,2,diag,0.001,5,k-means++,100,13089203.910885666,13040559.0,-2325.745,0.36971479654312134,1327.397216796875,1.073152783729392,True,5,2
|
||||
174,2,diag,0.001,5,k-means++,300,13089203.910885666,13040559.0,-2325.745,0.36971479654312134,1327.397216796875,1.073152783729392,True,5,2
|
||||
175,3,diag,1e-05,1,kmeans,100,12693850.335015846,12620880.0,-2249.3394,0.3760926127433777,779.2965087890625,0.7860265546274455,True,6,3
|
||||
176,3,diag,1e-05,1,kmeans,300,12693850.335015846,12620880.0,-2249.3394,0.3760926127433777,779.2965087890625,0.7860265546274455,True,6,3
|
||||
177,3,diag,1e-05,5,kmeans,100,11770626.335015846,11697656.0,-2084.4778,0.1531982570886612,837.4287719726562,1.6999940518251055,True,19,3
|
||||
178,3,diag,1e-05,5,kmeans,300,11770626.335015846,11697656.0,-2084.4778,0.1531982570886612,837.4287719726562,1.6999940518251055,True,19,3
|
||||
179,3,diag,0.0001,1,kmeans,100,12699627.335015846,12626657.0,-2250.3708,0.3760926127433777,779.2965087890625,0.7860265546274455,True,6,3
|
||||
180,3,diag,0.0001,1,kmeans,300,12699627.335015846,12626657.0,-2250.3708,0.3760926127433777,779.2965087890625,0.7860265546274455,True,6,3
|
||||
181,3,diag,0.0001,5,kmeans,100,11770626.335015846,11697656.0,-2084.4778,0.1531982570886612,837.4287719726562,1.6999940518251055,True,20,3
|
||||
182,3,diag,0.0001,5,kmeans,300,11770626.335015846,11697656.0,-2084.4778,0.1531982570886612,837.4287719726562,1.6999940518251055,True,20,3
|
||||
183,3,diag,0.001,1,kmeans,100,12718245.335015846,12645275.0,-2253.6956,0.3760926127433777,779.2965087890625,0.7860265546274455,True,7,3
|
||||
184,3,diag,0.001,1,kmeans,300,12718245.335015846,12645275.0,-2253.6956,0.3760926127433777,779.2965087890625,0.7860265546274455,True,7,3
|
||||
185,3,diag,0.001,5,kmeans,100,11770859.335015846,11697889.0,-2084.5195,0.15369778871536255,838.0372924804688,1.7007544548321498,True,19,3
|
||||
186,3,diag,0.001,5,kmeans,300,11770859.335015846,11697889.0,-2084.5195,0.15369778871536255,838.0372924804688,1.7007544548321498,True,19,3
|
||||
187,3,diag,1e-05,1,k-means++,100,11686081.335015846,11613111.0,-2069.3806,0.2351498007774353,882.5064086914062,2.071681816869212,True,19,3
|
||||
188,3,diag,1e-05,1,k-means++,300,11686081.335015846,11613111.0,-2069.3806,0.2351498007774353,882.5064086914062,2.071681816869212,True,19,3
|
||||
189,3,diag,1e-05,5,k-means++,100,11686081.335015846,11613111.0,-2069.3806,0.2351498007774353,882.5064086914062,2.071681816869212,True,19,3
|
||||
190,3,diag,1e-05,5,k-means++,300,11686081.335015846,11613111.0,-2069.3806,0.2351498007774353,882.5064086914062,2.071681816869212,True,19,3
|
||||
191,3,diag,0.0001,1,k-means++,100,11686083.335015846,11613113.0,-2069.3809,0.2351498007774353,882.5064086914062,2.071681816869212,True,21,3
|
||||
192,3,diag,0.0001,1,k-means++,300,11686083.335015846,11613113.0,-2069.3809,0.2351498007774353,882.5064086914062,2.071681816869212,True,21,3
|
||||
193,3,diag,0.0001,5,k-means++,100,11686083.335015846,11613113.0,-2069.3809,0.2351498007774353,882.5064086914062,2.071681816869212,True,21,3
|
||||
194,3,diag,0.0001,5,k-means++,300,11686083.335015846,11613113.0,-2069.3809,0.2351498007774353,882.5064086914062,2.071681816869212,True,21,3
|
||||
195,3,diag,0.001,1,k-means++,100,11686162.335015846,11613192.0,-2069.395,0.2351498007774353,882.5064086914062,2.071681816869212,True,22,3
|
||||
196,3,diag,0.001,1,k-means++,300,11686162.335015846,11613192.0,-2069.395,0.2351498007774353,882.5064086914062,2.071681816869212,True,22,3
|
||||
197,3,diag,0.001,5,k-means++,100,11686154.335015846,11613184.0,-2069.3936,0.2351498007774353,882.5064697265625,2.071681816869212,True,13,3
|
||||
198,3,diag,0.001,5,k-means++,300,11686154.335015846,11613184.0,-2069.3936,0.2351498007774353,882.5064697265625,2.071681816869212,True,13,3
|
||||
199,4,diag,1e-05,1,kmeans,100,11525150.759146027,11427855.0,-2034.8359,0.3090108335018158,828.0037841796875,1.0965690514458653,True,4,4
|
||||
200,4,diag,1e-05,1,kmeans,300,11525150.759146027,11427855.0,-2034.8359,0.3090108335018158,828.0037841796875,1.0965690514458653,True,4,4
|
||||
201,4,diag,1e-05,5,kmeans,100,10872145.759146027,10774850.0,-1918.2279,0.22238799929618835,665.7476806640625,2.503304023275595,True,17,4
|
||||
202,4,diag,1e-05,5,kmeans,300,10872145.759146027,10774850.0,-1918.2279,0.22238799929618835,665.7476806640625,2.503304023275595,True,17,4
|
||||
203,4,diag,0.0001,1,kmeans,100,11530927.759146027,11433632.0,-2035.8676,0.3090108335018158,828.0037841796875,1.0965690514458653,True,4,4
|
||||
204,4,diag,0.0001,1,kmeans,300,11530927.759146027,11433632.0,-2035.8676,0.3090108335018158,828.0037841796875,1.0965690514458653,True,4,4
|
||||
205,4,diag,0.0001,5,kmeans,100,10872147.759146027,10774852.0,-1918.2283,0.22238799929618835,665.7476806640625,2.503304023275595,True,16,4
|
||||
206,4,diag,0.0001,5,kmeans,300,10872147.759146027,10774852.0,-1918.2283,0.22238799929618835,665.7476806640625,2.503304023275595,True,16,4
|
||||
207,4,diag,0.001,1,kmeans,100,11549555.759146027,11452260.0,-2039.194,0.3090108335018158,828.0037841796875,1.0965690514458653,True,4,4
|
||||
208,4,diag,0.001,1,kmeans,300,11549555.759146027,11452260.0,-2039.194,0.3090108335018158,828.0037841796875,1.0965690514458653,True,4,4
|
||||
209,4,diag,0.001,5,kmeans,100,10872103.759146027,10774808.0,-1918.2203,0.2233457714319229,667.0189208984375,2.500263810780557,True,18,4
|
||||
210,4,diag,0.001,5,kmeans,300,10872103.759146027,10774808.0,-1918.2203,0.2233457714319229,667.0189208984375,2.500263810780557,True,18,4
|
||||
211,4,diag,1e-05,1,k-means++,100,10871971.759146027,10774676.0,-1918.1968,0.22509750723838806,669.1204223632812,2.4944239618747446,True,17,4
|
||||
212,4,diag,1e-05,1,k-means++,300,10871971.759146027,10774676.0,-1918.1968,0.22509750723838806,669.1204223632812,2.4944239618747446,True,17,4
|
||||
213,4,diag,1e-05,5,k-means++,100,10865268.759146027,10767973.0,-1916.9999,0.18924137949943542,715.3873291015625,1.9068310882928445,True,23,4
|
||||
214,4,diag,1e-05,5,k-means++,300,10865268.759146027,10767973.0,-1916.9999,0.18924137949943542,715.3873291015625,1.9068310882928445,True,23,4
|
||||
215,4,diag,0.0001,1,k-means++,100,11379467.759146027,11282172.0,-2008.821,0.23593758046627045,682.464599609375,1.5970337323460135,True,12,4
|
||||
216,4,diag,0.0001,1,k-means++,300,11379467.759146027,11282172.0,-2008.821,0.23593758046627045,682.464599609375,1.5970337323460135,True,12,4
|
||||
217,4,diag,0.0001,5,k-means++,100,10872147.759146027,10774852.0,-1918.2283,0.22238799929618835,665.7476806640625,2.503304023275595,True,26,4
|
||||
218,4,diag,0.0001,5,k-means++,300,10872147.759146027,10774852.0,-1918.2283,0.22238799929618835,665.7476806640625,2.503304023275595,True,26,4
|
||||
219,4,diag,0.001,1,k-means++,100,11398228.759146027,11300933.0,-2012.1713,0.23644769191741943,683.3140869140625,1.5953322750132166,True,13,4
|
||||
220,4,diag,0.001,1,k-means++,300,11398228.759146027,11300933.0,-2012.1713,0.23644769191741943,683.3140869140625,1.5953322750132166,True,13,4
|
||||
221,4,diag,0.001,5,k-means++,100,10872234.759146027,10774939.0,-1918.2438,0.22265465557575226,666.09033203125,2.5026321909350457,True,24,4
|
||||
222,4,diag,0.001,5,k-means++,300,10872234.759146027,10774939.0,-1918.2438,0.22265465557575226,666.09033203125,2.5026321909350457,True,24,4
|
||||
223,5,diag,1e-05,1,kmeans,100,10641753.183276208,10520132.0,-1871.2793,0.3118983507156372,774.8809814453125,1.0195520325095044,True,5,5
|
||||
224,5,diag,1e-05,1,kmeans,300,10641753.183276208,10520132.0,-1871.2793,0.3118983507156372,774.8809814453125,1.0195520325095044,True,5,5
|
||||
225,5,diag,1e-05,5,kmeans,100,10324953.183276208,10203332.0,-1814.7079,0.21296893060207367,639.6068115234375,1.5943357881476847,True,20,5
|
||||
226,5,diag,1e-05,5,kmeans,300,10324953.183276208,10203332.0,-1814.7079,0.21296893060207367,639.6068115234375,1.5943357881476847,True,20,5
|
||||
227,5,diag,0.0001,1,kmeans,100,10647529.183276208,10525908.0,-1872.3107,0.3118983507156372,774.8809814453125,1.0195520325095044,True,5,5
|
||||
228,5,diag,0.0001,1,kmeans,300,10647529.183276208,10525908.0,-1872.3107,0.3118983507156372,774.8809814453125,1.0195520325095044,True,5,5
|
||||
229,5,diag,0.0001,5,kmeans,100,10324954.183276208,10203333.0,-1814.708,0.21296893060207367,639.6068115234375,1.5943357881476847,True,20,5
|
||||
230,5,diag,0.0001,5,kmeans,300,10324954.183276208,10203333.0,-1814.708,0.21296893060207367,639.6068115234375,1.5943357881476847,True,20,5
|
||||
231,5,diag,0.001,1,kmeans,100,10666196.183276208,10544575.0,-1875.6442,0.3118983507156372,774.8809814453125,1.0195520325095044,True,5,5
|
||||
232,5,diag,0.001,1,kmeans,300,10666196.183276208,10544575.0,-1875.6442,0.3118983507156372,774.8809814453125,1.0195520325095044,True,5,5
|
||||
233,5,diag,0.001,5,kmeans,100,10327782.183276208,10206161.0,-1815.213,0.2155037522315979,645.0463256835938,1.5864905576853905,True,23,5
|
||||
234,5,diag,0.001,5,kmeans,300,10327782.183276208,10206161.0,-1815.213,0.2155037522315979,645.0463256835938,1.5864905576853905,True,23,5
|
||||
235,5,diag,1e-05,1,k-means++,100,9931250.183276208,9809629.0,-1744.4038,0.2225552350282669,602.316162109375,2.189639810006293,True,17,5
|
||||
236,5,diag,1e-05,1,k-means++,300,9931250.183276208,9809629.0,-1744.4038,0.2225552350282669,602.316162109375,2.189639810006293,True,17,5
|
||||
237,5,diag,1e-05,5,k-means++,100,9931250.183276208,9809629.0,-1744.4038,0.2225552350282669,602.316162109375,2.189639810006293,True,17,5
|
||||
238,5,diag,1e-05,5,k-means++,300,9931250.183276208,9809629.0,-1744.4038,0.2225552350282669,602.316162109375,2.189639810006293,True,17,5
|
||||
239,5,diag,0.0001,1,k-means++,100,10466296.183276208,10344675.0,-1839.9476,0.24029850959777832,631.74609375,1.4320268333089838,True,14,5
|
||||
240,5,diag,0.0001,1,k-means++,300,10466296.183276208,10344675.0,-1839.9476,0.24029850959777832,631.74609375,1.4320268333089838,True,14,5
|
||||
241,5,diag,0.0001,5,k-means++,100,9948819.183276208,9827198.0,-1747.541,0.19564315676689148,662.848876953125,1.6781877684435718,True,17,5
|
||||
242,5,diag,0.0001,5,k-means++,300,9948819.183276208,9827198.0,-1747.541,0.19564315676689148,662.848876953125,1.6781877684435718,True,17,5
|
||||
243,5,diag,0.001,1,k-means++,100,10485035.183276208,10363414.0,-1843.294,0.24058617651462555,631.865478515625,1.432154375281304,True,17,5
|
||||
244,5,diag,0.001,1,k-means++,300,10485035.183276208,10363414.0,-1843.294,0.24058617651462555,631.865478515625,1.432154375281304,True,17,5
|
||||
245,5,diag,0.001,5,k-means++,100,9949011.183276208,9827390.0,-1747.5753,0.19609026610851288,663.4921264648438,1.6774180513139325,True,19,5
|
||||
246,5,diag,0.001,5,k-means++,300,9949011.183276208,9827390.0,-1747.5753,0.19609026610851288,663.4921264648438,1.6774180513139325,True,19,5
|
||||
247,6,diag,1e-05,1,kmeans,100,9799004.60740639,9653058.0,-1714.9814,0.26129186153411865,629.4482421875,1.9915981688116708,True,15,6
|
||||
248,6,diag,1e-05,1,kmeans,300,9799004.60740639,9653058.0,-1714.9814,0.26129186153411865,629.4482421875,1.9915981688116708,True,15,6
|
||||
249,6,diag,1e-05,5,kmeans,100,9102218.60740639,8956272.0,-1590.5553,0.21352295577526093,566.8534545898438,2.13510416879353,True,19,6
|
||||
250,6,diag,1e-05,5,kmeans,300,9102218.60740639,8956272.0,-1590.5553,0.21352295577526093,566.8534545898438,2.13510416879353,True,19,6
|
||||
251,6,diag,0.0001,1,kmeans,100,9804784.60740639,9658838.0,-1716.0135,0.26129186153411865,629.4482421875,1.9915981688116708,True,15,6
|
||||
252,6,diag,0.0001,1,kmeans,300,9804784.60740639,9658838.0,-1716.0135,0.26129186153411865,629.4482421875,1.9915981688116708,True,15,6
|
||||
253,6,diag,0.0001,5,kmeans,100,9102220.60740639,8956274.0,-1590.5557,0.21352295577526093,566.8534545898438,2.13510416879353,True,19,6
|
||||
254,6,diag,0.0001,5,kmeans,300,9102220.60740639,8956274.0,-1590.5557,0.21352295577526093,566.8534545898438,2.13510416879353,True,19,6
|
||||
255,6,diag,0.001,1,kmeans,100,9823540.60740639,9677594.0,-1719.3629,0.26235219836235046,629.54638671875,1.991657340604508,True,16,6
|
||||
256,6,diag,0.001,1,kmeans,300,9823540.60740639,9677594.0,-1719.3629,0.26235219836235046,629.54638671875,1.991657340604508,True,16,6
|
||||
257,6,diag,0.001,5,kmeans,100,9102520.60740639,8956574.0,-1590.6093,0.21336553990840912,567.1337890625,2.137562111985855,True,28,6
|
||||
258,6,diag,0.001,5,kmeans,300,9102520.60740639,8956574.0,-1590.6093,0.21336553990840912,567.1337890625,2.137562111985855,True,28,6
|
||||
259,6,diag,1e-05,1,k-means++,100,9646063.60740639,9500117.0,-1687.6705,0.20277422666549683,612.2411499023438,1.4227889323530194,True,20,6
|
||||
260,6,diag,1e-05,1,k-means++,300,9646063.60740639,9500117.0,-1687.6705,0.20277422666549683,612.2411499023438,1.4227889323530194,True,20,6
|
||||
261,6,diag,1e-05,5,k-means++,100,9102296.60740639,8956350.0,-1590.5693,0.21240639686584473,565.2139282226562,2.136853531967839,True,21,6
|
||||
262,6,diag,1e-05,5,k-means++,300,9102296.60740639,8956350.0,-1590.5693,0.21240639686584473,565.2139282226562,2.136853531967839,True,21,6
|
||||
263,6,diag,0.0001,1,k-means++,100,9682908.60740639,9536962.0,-1694.25,0.17942221462726593,527.6393432617188,1.9590089629656866,True,34,6
|
||||
264,6,diag,0.0001,1,k-means++,300,9682908.60740639,9536962.0,-1694.25,0.17942221462726593,527.6393432617188,1.9590089629656866,True,34,6
|
||||
265,6,diag,0.0001,5,k-means++,100,9102298.60740639,8956352.0,-1590.5697,0.21240639686584473,565.2139282226562,2.136853531967839,True,24,6
|
||||
266,6,diag,0.0001,5,k-means++,300,9102298.60740639,8956352.0,-1590.5697,0.21240639686584473,565.2139282226562,2.136853531967839,True,24,6
|
||||
267,6,diag,0.001,1,k-means++,100,9701922.60740639,9555976.0,-1697.6454,0.18013142049312592,529.0560913085938,1.9546049777626981,True,31,6
|
||||
268,6,diag,0.001,1,k-means++,300,9701922.60740639,9555976.0,-1697.6454,0.18013142049312592,529.0560913085938,1.9546049777626981,True,31,6
|
||||
269,6,diag,0.001,5,k-means++,100,9102520.60740639,8956574.0,-1590.6093,0.21336553990840912,567.1337890625,2.137562111985855,True,31,6
|
||||
270,6,diag,0.001,5,k-means++,300,9102520.60740639,8956574.0,-1590.6093,0.21336553990840912,567.1337890625,2.137562111985855,True,31,6
|
||||
271,8,diag,1e-05,1,kmeans,100,9403674.455666753,9209077.0,-1632.7727,0.25722020864486694,545.548095703125,1.7082735900456691,True,9,8
|
||||
272,8,diag,1e-05,1,kmeans,300,9403674.455666753,9209077.0,-1632.7727,0.25722020864486694,545.548095703125,1.7082735900456691,True,9,8
|
||||
273,8,diag,1e-05,5,kmeans,100,8401628.455666753,8207031.0,-1453.8359,0.24377639591693878,568.4773559570312,2.056540191275295,True,13,8
|
||||
274,8,diag,1e-05,5,kmeans,300,8401628.455666753,8207031.0,-1453.8359,0.24377639591693878,568.4773559570312,2.056540191275295,True,13,8
|
||||
275,8,diag,0.0001,1,kmeans,100,9409411.455666753,9214814.0,-1633.7971,0.25722020864486694,545.548095703125,1.7082735900456691,True,9,8
|
||||
276,8,diag,0.0001,1,kmeans,300,9409411.455666753,9214814.0,-1633.7971,0.25722020864486694,545.548095703125,1.7082735900456691,True,9,8
|
||||
277,8,diag,0.0001,5,kmeans,100,8401766.955666753,8207169.5,-1453.8606,0.24412217736244202,568.5999755859375,2.0555351393061194,True,13,8
|
||||
278,8,diag,0.0001,5,kmeans,300,8401766.955666753,8207169.5,-1453.8606,0.24412217736244202,568.5999755859375,2.0555351393061194,True,13,8
|
||||
279,8,diag,0.001,1,kmeans,100,9428169.455666753,9233572.0,-1637.1467,0.2572267949581146,545.4515991210938,1.707896480449314,True,12,8
|
||||
280,8,diag,0.001,1,kmeans,300,9428169.455666753,9233572.0,-1637.1467,0.2572267949581146,545.4515991210938,1.707896480449314,True,12,8
|
||||
281,8,diag,0.001,5,kmeans,100,8402030.455666753,8207433.0,-1453.9077,0.24412217736244202,568.5999755859375,2.0555351393061194,True,19,8
|
||||
282,8,diag,0.001,5,kmeans,300,8402030.455666753,8207433.0,-1453.9077,0.24412217736244202,568.5999755859375,2.0555351393061194,True,19,8
|
||||
283,8,diag,1e-05,1,k-means++,100,9222857.455666753,9028260.0,-1600.4839,0.20888392627239227,437.5425109863281,2.1599292696306343,True,22,8
|
||||
284,8,diag,1e-05,1,k-means++,300,9222857.455666753,9028260.0,-1600.4839,0.20888392627239227,437.5425109863281,2.1599292696306343,True,22,8
|
||||
285,8,diag,1e-05,5,k-means++,100,8425934.455666753,8231337.0,-1458.1763,0.240326389670372,491.6292724609375,2.0912175194979867,True,18,8
|
||||
286,8,diag,1e-05,5,k-means++,300,8425934.455666753,8231337.0,-1458.1763,0.240326389670372,491.6292724609375,2.0912175194979867,True,18,8
|
||||
287,8,diag,0.0001,1,k-means++,100,9489607.455666753,9295010.0,-1648.1178,0.214387446641922,498.1647033691406,1.8502738691794258,True,15,8
|
||||
288,8,diag,0.0001,1,k-means++,300,9489607.455666753,9295010.0,-1648.1178,0.214387446641922,498.1647033691406,1.8502738691794258,True,15,8
|
||||
289,8,diag,0.0001,5,k-means++,100,8401862.455666753,8207265.0,-1453.8777,0.24387237429618835,568.502197265625,2.056662610079275,True,14,8
|
||||
290,8,diag,0.0001,5,k-means++,300,8401862.455666753,8207265.0,-1453.8777,0.24387237429618835,568.502197265625,2.056662610079275,True,14,8
|
||||
291,8,diag,0.001,1,k-means++,100,9508329.455666753,9313732.0,-1651.461,0.21473833918571472,498.2613830566406,1.850583105515141,True,13,8
|
||||
292,8,diag,0.001,1,k-means++,300,9508329.455666753,9313732.0,-1651.461,0.21473833918571472,498.2613830566406,1.850583105515141,True,13,8
|
||||
293,8,diag,0.001,5,k-means++,100,8402078.455666753,8207481.0,-1453.9163,0.2442118227481842,568.6036987304688,2.0567161321422094,True,17,8
|
||||
294,8,diag,0.001,5,k-means++,300,8402078.455666753,8207481.0,-1453.9163,0.2442118227481842,568.6036987304688,2.0567161321422094,True,17,8
|
||||
295,10,diag,1e-05,1,kmeans,100,7888954.303927114,7645706.0,-1350.6729,0.20840129256248474,479.23870849609375,1.9209118556333535,True,40,10
|
||||
296,10,diag,1e-05,1,kmeans,300,7888954.303927114,7645706.0,-1350.6729,0.20840129256248474,479.23870849609375,1.9209118556333535,True,40,10
|
||||
297,10,diag,1e-05,5,kmeans,100,7737961.303927114,7494713.0,-1323.7098,0.21908442676067352,571.03955078125,1.8024606257110887,True,14,10
|
||||
298,10,diag,1e-05,5,kmeans,300,7737961.303927114,7494713.0,-1323.7098,0.21908442676067352,571.03955078125,1.8024606257110887,True,14,10
|
||||
299,10,diag,0.0001,1,kmeans,100,7894699.303927114,7651451.0,-1351.6987,0.20837311446666718,479.2680969238281,1.921187996397482,True,39,10
|
||||
300,10,diag,0.0001,1,kmeans,300,7894699.303927114,7651451.0,-1351.6987,0.20837311446666718,479.2680969238281,1.921187996397482,True,39,10
|
||||
301,10,diag,0.0001,5,kmeans,100,7743709.303927114,7500461.0,-1324.7362,0.21908442676067352,571.03955078125,1.8024606257110887,True,15,10
|
||||
302,10,diag,0.0001,5,kmeans,300,7743709.303927114,7500461.0,-1324.7362,0.21908442676067352,571.03955078125,1.8024606257110887,True,15,10
|
||||
303,10,diag,0.001,1,kmeans,100,7913842.303927114,7670594.0,-1355.1172,0.20897234976291656,479.4935302734375,1.9199507888474652,True,32,10
|
||||
304,10,diag,0.001,1,kmeans,300,7913842.303927114,7670594.0,-1355.1172,0.20897234976291656,479.4935302734375,1.9199507888474652,True,32,10
|
||||
305,10,diag,0.001,5,kmeans,100,7762934.303927114,7519686.0,-1328.1693,0.2199474722146988,571.3079833984375,1.801659072375885,True,12,10
|
||||
306,10,diag,0.001,5,kmeans,300,7762934.303927114,7519686.0,-1328.1693,0.2199474722146988,571.3079833984375,1.801659072375885,True,12,10
|
||||
307,10,diag,1e-05,1,k-means++,100,7924116.803927114,7680868.5,-1356.9519,0.18109233677387238,451.2889099121094,2.0573889908400558,True,21,10
|
||||
308,10,diag,1e-05,1,k-means++,300,7924116.803927114,7680868.5,-1356.9519,0.18109233677387238,451.2889099121094,2.0573889908400558,True,21,10
|
||||
309,10,diag,1e-05,5,k-means++,100,7738147.803927114,7494899.5,-1323.7432,0.2197750359773636,571.7454223632812,1.7996143618833955,True,15,10
|
||||
310,10,diag,1e-05,5,k-means++,300,7738147.803927114,7494899.5,-1323.7432,0.2197750359773636,571.7454223632812,1.7996143618833955,True,15,10
|
||||
311,10,diag,0.0001,1,k-means++,100,8187378.303927114,7944130.0,-1403.9629,0.18437595665454865,504.9218444824219,1.8422731699234043,True,15,10
|
||||
312,10,diag,0.0001,1,k-means++,300,8187378.303927114,7944130.0,-1403.9629,0.18437595665454865,504.9218444824219,1.8422731699234043,True,15,10
|
||||
313,10,diag,0.0001,5,k-means++,100,7743709.303927114,7500461.0,-1324.7362,0.21908442676067352,571.03955078125,1.8024606257110887,True,19,10
|
||||
314,10,diag,0.0001,5,k-means++,300,7743709.303927114,7500461.0,-1324.7362,0.21908442676067352,571.03955078125,1.8024606257110887,True,19,10
|
||||
315,10,diag,0.001,1,k-means++,100,8206320.303927114,7963072.0,-1407.3453,0.18497711420059204,505.0896301269531,1.8418734003624215,True,12,10
|
||||
316,10,diag,0.001,1,k-means++,300,8206320.303927114,7963072.0,-1407.3453,0.18497711420059204,505.0896301269531,1.8418734003624215,True,12,10
|
||||
317,10,diag,0.001,5,k-means++,100,7762911.303927114,7519663.0,-1328.1652,0.21994036436080933,571.3048706054688,1.8012326594817665,True,14,10
|
||||
318,10,diag,0.001,5,k-means++,300,7762911.303927114,7519663.0,-1328.1652,0.21994036436080933,571.3048706054688,1.8012326594817665,True,14,10
|
||||
319,11,diag,1e-05,1,kmeans,100,7579813.728057295,7312240.0,-1289.6621,0.23397988080978394,564.2086791992188,1.7291402394614084,True,11,11
|
||||
320,11,diag,1e-05,1,kmeans,300,7579813.728057295,7312240.0,-1289.6621,0.23397988080978394,564.2086791992188,1.7291402394614084,True,11,11
|
||||
321,11,diag,1e-05,5,kmeans,100,7579813.728057295,7312240.0,-1289.6621,0.23397988080978394,564.2086791992188,1.7291402394614084,True,11,11
|
||||
322,11,diag,1e-05,5,kmeans,300,7579813.728057295,7312240.0,-1289.6621,0.23397988080978394,564.2086791992188,1.7291402394614084,True,11,11
|
||||
323,11,diag,0.0001,1,kmeans,100,7585561.228057295,7317987.5,-1290.6885,0.23397988080978394,564.2086791992188,1.7291402394614084,True,13,11
|
||||
324,11,diag,0.0001,1,kmeans,300,7585561.228057295,7317987.5,-1290.6885,0.23397988080978394,564.2086791992188,1.7291402394614084,True,13,11
|
||||
325,11,diag,0.0001,5,kmeans,100,7585561.228057295,7317987.5,-1290.6885,0.23397988080978394,564.2086791992188,1.7291402394614084,True,13,11
|
||||
326,11,diag,0.0001,5,kmeans,300,7585561.228057295,7317987.5,-1290.6885,0.23397988080978394,564.2086791992188,1.7291402394614084,True,13,11
|
||||
327,11,diag,0.001,1,kmeans,100,7604425.728057295,7336852.0,-1294.0571,0.2341206818819046,564.5838012695312,1.7292947793505737,True,23,11
|
||||
328,11,diag,0.001,1,kmeans,300,7604425.728057295,7336852.0,-1294.0571,0.2341206818819046,564.5838012695312,1.7292947793505737,True,23,11
|
||||
329,11,diag,0.001,5,kmeans,100,7604425.728057295,7336852.0,-1294.0571,0.2341206818819046,564.5838012695312,1.7292947793505737,True,23,11
|
||||
330,11,diag,0.001,5,kmeans,300,7604425.728057295,7336852.0,-1294.0571,0.2341206818819046,564.5838012695312,1.7292947793505737,True,23,11
|
||||
331,11,diag,1e-05,1,k-means++,100,7791839.228057295,7524265.5,-1327.5238,0.19072884321212769,512.7921752929688,1.8151284796024916,True,12,11
|
||||
332,11,diag,1e-05,1,k-means++,300,7791839.228057295,7524265.5,-1327.5238,0.19072884321212769,512.7921752929688,1.8151284796024916,True,12,11
|
||||
333,11,diag,1e-05,5,k-means++,100,7590459.228057295,7322885.5,-1291.5631,0.23250937461853027,557.0549926757812,1.6956773285989237,True,15,11
|
||||
334,11,diag,1e-05,5,k-means++,300,7590459.228057295,7322885.5,-1291.5631,0.23250937461853027,557.0549926757812,1.6956773285989237,True,15,11
|
||||
335,11,diag,0.0001,1,k-means++,100,7673894.728057295,7406321.0,-1306.4623,0.1943679302930832,520.76953125,1.7964734396608426,True,18,11
|
||||
336,11,diag,0.0001,1,k-means++,300,7673894.728057295,7406321.0,-1306.4623,0.1943679302930832,520.76953125,1.7964734396608426,True,18,11
|
||||
337,11,diag,0.0001,5,k-means++,100,7595473.228057295,7327899.5,-1292.4585,0.23414196074008942,558.0660400390625,1.7058883755528695,True,22,11
|
||||
338,11,diag,0.0001,5,k-means++,300,7595473.228057295,7327899.5,-1292.4585,0.23414196074008942,558.0660400390625,1.7058883755528695,True,22,11
|
||||
339,11,diag,0.001,1,k-means++,100,7694915.228057295,7427341.5,-1310.216,0.1941366195678711,518.650634765625,1.8003890716705255,True,19,11
|
||||
340,11,diag,0.001,1,k-means++,300,7694915.228057295,7427341.5,-1310.216,0.1941366195678711,518.650634765625,1.8003890716705255,True,19,11
|
||||
341,11,diag,0.001,5,k-means++,100,7603135.728057295,7335562.0,-1293.8268,0.23057277500629425,554.763427734375,1.7732846393460022,True,61,11
|
||||
342,11,diag,0.001,5,k-means++,300,7603135.728057295,7335562.0,-1293.8268,0.23057277500629425,554.763427734375,1.7732846393460022,True,61,11
|
||||
343,14,diag,1e-05,1,kmeans,100,7223238.000447838,6882688.0,-1208.5668,0.189774751663208,445.0321044921875,2.405689156966965,True,35,14
|
||||
344,14,diag,1e-05,1,kmeans,300,7223238.000447838,6882688.0,-1208.5668,0.189774751663208,445.0321044921875,2.405689156966965,True,35,14
|
||||
345,14,diag,1e-05,5,kmeans,100,7179637.000447838,6839087.0,-1200.7809,0.19881103932857513,443.0815124511719,2.3498009106287974,True,37,14
|
||||
346,14,diag,1e-05,5,kmeans,300,7179637.000447838,6839087.0,-1200.7809,0.19881103932857513,443.0815124511719,2.3498009106287974,True,37,14
|
||||
347,14,diag,0.0001,1,kmeans,100,7403717.000447838,7063167.0,-1240.7952,0.19351521134376526,467.3027038574219,1.8745209391776043,True,9,14
|
||||
348,14,diag,0.0001,1,kmeans,300,7403717.000447838,7063167.0,-1240.7952,0.19351521134376526,467.3027038574219,1.8745209391776043,True,9,14
|
||||
349,14,diag,0.0001,5,kmeans,100,7185399.000447838,6844849.0,-1201.8098,0.1989377737045288,443.1646423339844,2.3487992155318165,True,36,14
|
||||
350,14,diag,0.0001,5,kmeans,300,7185399.000447838,6844849.0,-1201.8098,0.1989377737045288,443.1646423339844,2.3487992155318165,True,36,14
|
||||
351,14,diag,0.001,1,kmeans,100,7249591.000447838,6909041.0,-1213.2727,0.1921805739402771,446.1286315917969,2.4012027743509816,True,32,14
|
||||
352,14,diag,0.001,1,kmeans,300,7249591.000447838,6909041.0,-1213.2727,0.1921805739402771,446.1286315917969,2.4012027743509816,True,32,14
|
||||
353,14,diag,0.001,5,kmeans,100,7206289.000447838,6865739.0,-1205.5402,0.2005954384803772,446.14300537109375,2.3618214198344596,True,35,14
|
||||
354,14,diag,0.001,5,kmeans,300,7206289.000447838,6865739.0,-1205.5402,0.2005954384803772,446.14300537109375,2.3618214198344596,True,35,14
|
||||
355,14,diag,1e-05,1,k-means++,100,7407422.000447838,7066872.0,-1241.4568,0.17794980108737946,461.5752868652344,1.8973642909602886,True,16,14
|
||||
356,14,diag,1e-05,1,k-means++,300,7407422.000447838,7066872.0,-1241.4568,0.17794980108737946,461.5752868652344,1.8973642909602886,True,16,14
|
||||
357,14,diag,1e-05,5,k-means++,100,7225380.500447838,6884830.5,-1208.9493,0.21065233647823334,449.29351806640625,2.369687246645459,True,41,14
|
||||
358,14,diag,1e-05,5,k-means++,300,7225380.500447838,6884830.5,-1208.9493,0.21065233647823334,449.29351806640625,2.369687246645459,True,41,14
|
||||
359,14,diag,0.0001,1,k-means++,100,7409969.000447838,7069419.0,-1241.9116,0.18569153547286987,462.6240539550781,1.8868216012452776,True,15,14
|
||||
360,14,diag,0.0001,1,k-means++,300,7409969.000447838,7069419.0,-1241.9116,0.18569153547286987,462.6240539550781,1.8868216012452776,True,15,14
|
||||
361,14,diag,0.0001,5,k-means++,100,7197481.000447838,6856931.0,-1203.9673,0.1839146614074707,442.7921142578125,2.4198138057845995,True,34,14
|
||||
362,14,diag,0.0001,5,k-means++,300,7197481.000447838,6856931.0,-1203.9673,0.1839146614074707,442.7921142578125,2.4198138057845995,True,34,14
|
||||
363,14,diag,0.001,1,k-means++,300,7428971.000447838,7088421.0,-1245.3048,0.18628861010074615,462.73419189453125,1.8868677406736207,False,300,14
|
||||
364,14,diag,0.001,5,k-means++,100,7217251.500447838,6876701.5,-1207.4978,0.18614986538887024,443.1617431640625,2.4207333709907854,True,30,14
|
||||
365,14,diag,0.001,5,k-means++,300,7217251.500447838,6876701.5,-1207.4978,0.18614986538887024,443.1617431640625,2.4207333709907854,True,30,14
|
||||
366,17,diag,1e-05,1,kmeans,100,7157834.272838381,6744308.0,-1179.4664,0.15990924835205078,403.0197448730469,2.0343843124532546,True,24,17
|
||||
367,17,diag,1e-05,1,kmeans,300,7157834.272838381,6744308.0,-1179.4664,0.15990924835205078,403.0197448730469,2.0343843124532546,True,24,17
|
||||
368,17,diag,1e-05,5,kmeans,100,6988291.272838381,6574765.0,-1149.1909,0.18784816563129425,396.8871154785156,2.38212018534803,True,20,17
|
||||
369,17,diag,1e-05,5,kmeans,300,6988291.272838381,6574765.0,-1149.1909,0.18784816563129425,396.8871154785156,2.38212018534803,True,20,17
|
||||
370,17,diag,0.0001,1,kmeans,100,7164777.772838381,6751251.5,-1180.7063,0.16118811070919037,403.645263671875,2.0161663571062354,True,12,17
|
||||
371,17,diag,0.0001,1,kmeans,300,7164777.772838381,6751251.5,-1180.7063,0.16118811070919037,403.645263671875,2.0161663571062354,True,12,17
|
||||
372,17,diag,0.0001,5,kmeans,100,6993985.272838381,6580459.0,-1150.2076,0.18836656212806702,396.94464111328125,2.382098456335865,True,20,17
|
||||
373,17,diag,0.0001,5,kmeans,300,6993985.272838381,6580459.0,-1150.2076,0.18836656212806702,396.94464111328125,2.382098456335865,True,20,17
|
||||
374,17,diag,0.001,1,kmeans,100,7020208.272838381,6606682.0,-1154.8904,0.16110706329345703,392.0225830078125,2.466771067136851,True,32,17
|
||||
375,17,diag,0.001,1,kmeans,300,7020208.272838381,6606682.0,-1154.8904,0.16110706329345703,392.0225830078125,2.466771067136851,True,32,17
|
||||
376,17,diag,0.001,5,kmeans,100,7014078.772838381,6600552.5,-1153.7958,0.19070318341255188,397.37750244140625,2.3799724457371485,True,20,17
|
||||
377,17,diag,0.001,5,kmeans,300,7014078.772838381,6600552.5,-1153.7958,0.19070318341255188,397.37750244140625,2.3799724457371485,True,20,17
|
||||
378,17,diag,1e-05,1,k-means++,100,7121674.772838381,6708148.5,-1173.0094,0.15012019872665405,375.2821350097656,2.4197980533663803,True,31,17
|
||||
379,17,diag,1e-05,1,k-means++,300,7121674.772838381,6708148.5,-1173.0094,0.15012019872665405,375.2821350097656,2.4197980533663803,True,31,17
|
||||
380,17,diag,1e-05,5,k-means++,100,7005072.772838381,6591546.5,-1152.1876,0.14115209877490997,384.4289245605469,2.4847770953101596,True,29,17
|
||||
381,17,diag,1e-05,5,k-means++,300,7005072.772838381,6591546.5,-1152.1876,0.14115209877490997,384.4289245605469,2.4847770953101596,True,29,17
|
||||
382,17,diag,0.0001,1,k-means++,100,7293509.772838381,6879983.5,-1203.6942,0.15238241851329803,397.42816162109375,2.107055060535422,True,15,17
|
||||
383,17,diag,0.0001,1,k-means++,300,7293509.772838381,6879983.5,-1203.6942,0.15238241851329803,397.42816162109375,2.107055060535422,True,15,17
|
||||
384,17,diag,0.0001,5,k-means++,100,7015674.772838381,6602148.5,-1154.0808,0.1819005310535431,394.13629150390625,2.4964933433175283,True,18,17
|
||||
385,17,diag,0.0001,5,k-means++,300,7015674.772838381,6602148.5,-1154.0808,0.1819005310535431,394.13629150390625,2.4964933433175283,True,18,17
|
||||
386,17,diag,0.001,1,k-means++,100,7312575.772838381,6899049.5,-1207.0989,0.15248946845531464,397.60723876953125,2.1086064619099547,True,17,17
|
||||
387,17,diag,0.001,1,k-means++,300,7312575.772838381,6899049.5,-1207.0989,0.15248946845531464,397.60723876953125,2.1086064619099547,True,17,17
|
||||
388,17,diag,0.001,5,k-means++,100,7034312.772838381,6620786.5,-1157.409,0.18249236047267914,394.5459289550781,2.4918179246451175,True,23,17
|
||||
389,17,diag,0.001,5,k-means++,300,7034312.772838381,6620786.5,-1157.409,0.18249236047267914,394.5459289550781,2.4918179246451175,True,23,17
|
||||
390,20,diag,1e-05,1,kmeans,100,6849987.045228925,6363484.5,-1107.0726,0.1538863182067871,351.1917419433594,2.4313421881484762,True,30,20
|
||||
391,20,diag,1e-05,1,kmeans,300,6849987.045228925,6363484.5,-1107.0726,0.1538863182067871,351.1917419433594,2.4313421881484762,True,30,20
|
||||
392,20,diag,1e-05,5,kmeans,100,6849987.045228925,6363484.5,-1107.0726,0.1538863182067871,351.1917419433594,2.4313421881484762,True,30,20
|
||||
393,20,diag,1e-05,5,kmeans,300,6849987.045228925,6363484.5,-1107.0726,0.1538863182067871,351.1917419433594,2.4313421881484762,True,30,20
|
||||
394,20,diag,0.0001,1,kmeans,100,6855879.045228925,6369376.5,-1108.1248,0.15445564687252045,351.1902160644531,2.4330055346823083,True,25,20
|
||||
395,20,diag,0.0001,1,kmeans,300,6855879.045228925,6369376.5,-1108.1248,0.15445564687252045,351.1902160644531,2.4330055346823083,True,25,20
|
||||
396,20,diag,0.0001,5,kmeans,100,6855879.045228925,6369376.5,-1108.1248,0.15445564687252045,351.1902160644531,2.4330055346823083,True,25,20
|
||||
397,20,diag,0.0001,5,kmeans,300,6855879.045228925,6369376.5,-1108.1248,0.15445564687252045,351.1902160644531,2.4330055346823083,True,25,20
|
||||
398,20,diag,0.001,1,kmeans,100,6875191.545228925,6388689.0,-1111.5734,0.15459507703781128,351.5787658691406,2.432923325373909,True,36,20
|
||||
399,20,diag,0.001,1,kmeans,300,6875191.545228925,6388689.0,-1111.5734,0.15459507703781128,351.5787658691406,2.432923325373909,True,36,20
|
||||
400,20,diag,0.001,5,kmeans,100,6875191.545228925,6388689.0,-1111.5734,0.15459507703781128,351.5787658691406,2.432923325373909,True,36,20
|
||||
401,20,diag,0.001,5,kmeans,300,6875191.545228925,6388689.0,-1111.5734,0.15459507703781128,351.5787658691406,2.432923325373909,True,36,20
|
||||
402,20,diag,1e-05,1,k-means++,100,6978855.045228925,6492352.5,-1130.0847,0.13519038259983063,338.322509765625,2.5026143875581077,True,24,20
|
||||
403,20,diag,1e-05,1,k-means++,300,6978855.045228925,6492352.5,-1130.0847,0.13519038259983063,338.322509765625,2.5026143875581077,True,24,20
|
||||
404,20,diag,1e-05,5,k-means++,100,6897127.045228925,6410624.5,-1115.4905,0.13251666724681854,352.5394592285156,2.4669189695674225,True,42,20
|
||||
405,20,diag,1e-05,5,k-means++,300,6897127.045228925,6410624.5,-1115.4905,0.13251666724681854,352.5394592285156,2.4669189695674225,True,42,20
|
||||
406,20,diag,0.0001,1,k-means++,100,7011968.045228925,6525465.5,-1135.9978,0.14400699734687805,344.567138671875,2.517887865440349,True,30,20
|
||||
407,20,diag,0.0001,1,k-means++,300,7011968.045228925,6525465.5,-1135.9978,0.14400699734687805,344.567138671875,2.517887865440349,True,30,20
|
||||
408,20,diag,0.0001,5,k-means++,100,6905988.545228925,6419486.0,-1117.0729,0.13107705116271973,351.8740234375,2.4956842864961937,True,36,20
|
||||
409,20,diag,0.0001,5,k-means++,300,6905988.545228925,6419486.0,-1117.0729,0.13107705116271973,351.8740234375,2.4956842864961937,True,36,20
|
||||
410,20,diag,0.001,1,k-means++,100,7031180.545228925,6544678.0,-1139.4286,0.14613750576972961,345.2534484863281,2.516432567197497,True,27,20
|
||||
411,20,diag,0.001,1,k-means++,300,7031180.545228925,6544678.0,-1139.4286,0.14613750576972961,345.2534484863281,2.516432567197497,True,27,20
|
||||
412,20,diag,0.001,5,k-means++,100,6918391.545228925,6431889.0,-1119.2877,0.13308578729629517,351.49005126953125,2.474649164658472,True,35,20
|
||||
413,20,diag,0.001,5,k-means++,300,6918391.545228925,6431889.0,-1119.2877,0.13308578729629517,351.49005126953125,2.474649164658472,True,35,20
|
|
1009
cluster/gmm_grid_search_summary_20250805_150635.csv
Normal file
1009
cluster/gmm_grid_search_summary_20250805_150635.csv
Normal file
File diff suppressed because it is too large
Load Diff
158
cluster/log_gmm_extensive.txt
Normal file
158
cluster/log_gmm_extensive.txt
Normal file
@@ -0,0 +1,158 @@
|
||||
nohup: ignoring input
|
||||
Loading embeddings from /home/nguyendc/sonnh/embedding-clustering/extract/embeddings_factures_osteopathie_1k_qwen.json...
|
||||
Loaded 2800 samples with embedding dimension 2048
|
||||
|
||||
======================================================================
|
||||
RUNNING GAUSSIAN MIXTURE MODEL CLUSTERING WITH OPTIMIZED GRID SEARCH
|
||||
======================================================================
|
||||
Optimized parameter combinations:
|
||||
- n_components: 11 values [2, 3, 4, 5, 6, 8, 10, 11, 14, 17, 20]
|
||||
- covariance_types: 2 options ['full', 'diag']
|
||||
- reg_covar: 3 values [1e-05, 0.0001, 0.001]
|
||||
- n_init: 2 values [1, 5]
|
||||
- init_params: 2 options ['kmeans', 'k-means++']
|
||||
- max_iter: 2 values [100, 300]
|
||||
Total combinations: 528 (optimized for speed)
|
||||
Estimated runtime: 4.4 minutes
|
||||
This should be much faster...
|
||||
|
||||
Progress: 50/528 (9.5%) - Best scores so far: BIC=17260132.61, Silhouette=0.376
|
||||
Progress: 100/528 (18.9%) - Best scores so far: BIC=17260132.61, Silhouette=0.376
|
||||
Progress: 150/528 (28.4%) - Best scores so far: BIC=17260132.61, Silhouette=0.376
|
||||
Progress: 200/528 (37.9%) - Best scores so far: BIC=17260132.61, Silhouette=0.376
|
||||
Progress: 250/528 (47.3%) - Best scores so far: BIC=17260132.61, Silhouette=0.376
|
||||
n_components=2, cov=diag, init=kmeans: BIC=13089173.91, AIC=13040529.00, silhouette=0.3697
|
||||
n_components=2, cov=diag, init=kmeans: BIC=13089173.91, AIC=13040529.00, silhouette=0.3697
|
||||
n_components=2, cov=diag, init=kmeans: BIC=13089173.91, AIC=13040529.00, silhouette=0.3697
|
||||
n_components=2, cov=diag, init=kmeans: BIC=13089173.91, AIC=13040529.00, silhouette=0.3697
|
||||
n_components=2, cov=diag, init=kmeans: BIC=13089173.91, AIC=13040529.00, silhouette=0.3697
|
||||
n_components=2, cov=diag, init=kmeans: BIC=13089173.91, AIC=13040529.00, silhouette=0.3697
|
||||
n_components=2, cov=diag, init=kmeans: BIC=13089173.91, AIC=13040529.00, silhouette=0.3697
|
||||
n_components=2, cov=diag, init=kmeans: BIC=13089173.91, AIC=13040529.00, silhouette=0.3697
|
||||
n_components=2, cov=diag, init=kmeans: BIC=13089203.91, AIC=13040559.00, silhouette=0.3697
|
||||
n_components=2, cov=diag, init=kmeans: BIC=13089203.91, AIC=13040559.00, silhouette=0.3697
|
||||
n_components=2, cov=diag, init=kmeans: BIC=13089203.91, AIC=13040559.00, silhouette=0.3697
|
||||
n_components=2, cov=diag, init=kmeans: BIC=13089203.91, AIC=13040559.00, silhouette=0.3697
|
||||
n_components=2, cov=diag, init=k-means++: BIC=13089173.91, AIC=13040529.00, silhouette=0.3697
|
||||
n_components=2, cov=diag, init=k-means++: BIC=13089173.91, AIC=13040529.00, silhouette=0.3697
|
||||
n_components=2, cov=diag, init=k-means++: BIC=13089173.91, AIC=13040529.00, silhouette=0.3697
|
||||
n_components=2, cov=diag, init=k-means++: BIC=13089173.91, AIC=13040529.00, silhouette=0.3697
|
||||
n_components=2, cov=diag, init=k-means++: BIC=13089173.91, AIC=13040529.00, silhouette=0.3697
|
||||
n_components=2, cov=diag, init=k-means++: BIC=13089173.91, AIC=13040529.00, silhouette=0.3697
|
||||
n_components=2, cov=diag, init=k-means++: BIC=13089173.91, AIC=13040529.00, silhouette=0.3697
|
||||
n_components=2, cov=diag, init=k-means++: BIC=13089173.91, AIC=13040529.00, silhouette=0.3697
|
||||
n_components=2, cov=diag, init=k-means++: BIC=13089203.91, AIC=13040559.00, silhouette=0.3697
|
||||
n_components=2, cov=diag, init=k-means++: BIC=13089203.91, AIC=13040559.00, silhouette=0.3697
|
||||
n_components=2, cov=diag, init=k-means++: BIC=13089203.91, AIC=13040559.00, silhouette=0.3697
|
||||
n_components=2, cov=diag, init=k-means++: BIC=13089203.91, AIC=13040559.00, silhouette=0.3697
|
||||
n_components=3, cov=diag, init=kmeans: BIC=12693850.34, AIC=12620880.00, silhouette=0.3761
|
||||
n_components=3, cov=diag, init=kmeans: BIC=12693850.34, AIC=12620880.00, silhouette=0.3761
|
||||
n_components=3, cov=diag, init=kmeans: BIC=12699627.34, AIC=12626657.00, silhouette=0.3761
|
||||
n_components=3, cov=diag, init=kmeans: BIC=12699627.34, AIC=12626657.00, silhouette=0.3761
|
||||
n_components=3, cov=diag, init=kmeans: BIC=12718245.34, AIC=12645275.00, silhouette=0.3761
|
||||
n_components=3, cov=diag, init=kmeans: BIC=12718245.34, AIC=12645275.00, silhouette=0.3761
|
||||
Progress: 300/528 (56.8%) - Best scores so far: BIC=11770626.34, Silhouette=0.376
|
||||
n_components=4, cov=diag, init=kmeans: BIC=11525150.76, AIC=11427855.00, silhouette=0.3090
|
||||
n_components=4, cov=diag, init=kmeans: BIC=11525150.76, AIC=11427855.00, silhouette=0.3090
|
||||
n_components=4, cov=diag, init=kmeans: BIC=11530927.76, AIC=11433632.00, silhouette=0.3090
|
||||
n_components=4, cov=diag, init=kmeans: BIC=11530927.76, AIC=11433632.00, silhouette=0.3090
|
||||
n_components=4, cov=diag, init=kmeans: BIC=11549555.76, AIC=11452260.00, silhouette=0.3090
|
||||
n_components=4, cov=diag, init=kmeans: BIC=11549555.76, AIC=11452260.00, silhouette=0.3090
|
||||
n_components=5, cov=diag, init=kmeans: BIC=10641753.18, AIC=10520132.00, silhouette=0.3119
|
||||
n_components=5, cov=diag, init=kmeans: BIC=10641753.18, AIC=10520132.00, silhouette=0.3119
|
||||
n_components=5, cov=diag, init=kmeans: BIC=10647529.18, AIC=10525908.00, silhouette=0.3119
|
||||
n_components=5, cov=diag, init=kmeans: BIC=10647529.18, AIC=10525908.00, silhouette=0.3119
|
||||
n_components=5, cov=diag, init=kmeans: BIC=10666196.18, AIC=10544575.00, silhouette=0.3119
|
||||
n_components=5, cov=diag, init=kmeans: BIC=10666196.18, AIC=10544575.00, silhouette=0.3119
|
||||
Progress: 350/528 (66.3%) - Best scores so far: BIC=9931250.18, Silhouette=0.376
|
||||
Progress: 400/528 (75.8%) - Best scores so far: BIC=8401628.46, Silhouette=0.376
|
||||
Progress: 450/528 (85.2%) - Best scores so far: BIC=7579813.73, Silhouette=0.376
|
||||
Progress: 500/528 (94.7%) - Best scores so far: BIC=6988291.27, Silhouette=0.376
|
||||
Progress: 528/528 (100.0%) - Best scores so far: BIC=6849987.05, Silhouette=0.376
|
||||
|
||||
======================================================================
|
||||
GAUSSIAN MIXTURE MODEL GRID SEARCH ANALYSIS
|
||||
======================================================================
|
||||
Total parameter combinations tested: 413
|
||||
Combinations with valid clustering: 413
|
||||
|
||||
Model Selection Metrics:
|
||||
Best BIC score: 6849987.05
|
||||
Best AIC score: -11119584.00
|
||||
Best Log-Likelihood: 6594.97
|
||||
|
||||
Clustering Quality Metrics:
|
||||
Best silhouette score: 0.3761
|
||||
Mean silhouette score: 0.2317
|
||||
Best Calinski-Harabasz score: 1331.69
|
||||
Best Davies-Bouldin score: 0.7860
|
||||
|
||||
Top 5 results by BIC (lower is better):
|
||||
n_comp=20, cov=diag: BIC=6849987.05, AIC=6363484.50
|
||||
n_comp=20, cov=diag: BIC=6849987.05, AIC=6363484.50
|
||||
n_comp=20, cov=diag: BIC=6849987.05, AIC=6363484.50
|
||||
n_comp=20, cov=diag: BIC=6849987.05, AIC=6363484.50
|
||||
n_comp=20, cov=diag: BIC=6855879.05, AIC=6369376.50
|
||||
|
||||
Top 5 results by AIC (lower is better):
|
||||
n_comp=4, cov=full: BIC=38759701.15, AIC=-11119584.00
|
||||
n_comp=4, cov=full: BIC=38759701.15, AIC=-11119584.00
|
||||
n_comp=3, cov=full: BIC=26462676.38, AIC=-10946786.00
|
||||
n_comp=3, cov=full: BIC=26462676.38, AIC=-10946786.00
|
||||
n_comp=5, cov=full: BIC=54230057.92, AIC=-8119050.00
|
||||
|
||||
Top 5 results by Silhouette Score:
|
||||
n_comp=3, cov=diag: silhouette=0.3761
|
||||
n_comp=3, cov=diag: silhouette=0.3761
|
||||
n_comp=3, cov=diag: silhouette=0.3761
|
||||
n_comp=3, cov=diag: silhouette=0.3761
|
||||
n_comp=3, cov=diag: silhouette=0.3761
|
||||
|
||||
Component count analysis (top 10 by BIC):
|
||||
20.0 components: BIC=6849987.05, AIC=6363484.50, silhouette=0.1770
|
||||
17.0 components: BIC=6988291.27, AIC=6574765.00, silhouette=0.2085
|
||||
14.0 components: BIC=7179637.00, AIC=6839087.00, silhouette=0.2119
|
||||
11.0 components: BIC=7579813.73, AIC=7312240.00, silhouette=0.2577
|
||||
10.0 components: BIC=7737961.30, AIC=7494713.00, silhouette=0.2863
|
||||
8.0 components: BIC=8401628.46, AIC=1051428.00, silhouette=0.2748
|
||||
6.0 components: BIC=9102218.61, AIC=-6065602.00, silhouette=0.2707
|
||||
5.0 components: BIC=9931250.18, AIC=-8119050.00, silhouette=0.3163
|
||||
4.0 components: BIC=10865268.76, AIC=-11119584.00, silhouette=0.3110
|
||||
3.0 components: BIC=11686081.34, AIC=-10946786.00, silhouette=0.3761
|
||||
|
||||
📁 SAVING DETAILED RESULTS...
|
||||
==============================
|
||||
Detailed grid search results saved to: gmm_grid_search_detailed_20250801_015245.json
|
||||
Grid search summary CSV saved to: gmm_grid_search_summary_20250801_015245.csv
|
||||
|
||||
Best GMM result by BIC:
|
||||
Parameters: {'n_components': 20, 'covariance_type': 'diag', 'reg_covar': 1e-05, 'n_init': 1, 'init_params': 'kmeans', 'max_iter': 100}
|
||||
BIC score: 6849987.05
|
||||
|
||||
Best GMM result by AIC:
|
||||
Parameters: {'n_components': 4, 'covariance_type': 'full', 'reg_covar': 0.0001, 'n_init': 5, 'init_params': 'kmeans', 'max_iter': 100}
|
||||
AIC score: -11119584.00
|
||||
|
||||
Best GMM result by Silhouette:
|
||||
Parameters: {'n_components': 3, 'covariance_type': 'diag', 'reg_covar': 1e-05, 'n_init': 1, 'init_params': 'kmeans', 'max_iter': 100}
|
||||
Silhouette score: 0.3761
|
||||
Visualization saved as 'gmm_clustering_results.png'
|
||||
Final clustering results (bic) saved to: gmm_final_results_bic_20250801_015247.json
|
||||
Traceback (most recent call last):
|
||||
File "/home/nguyendc/sonnh/embedding-clustering/cluster/gmm_extensive.py", line 646, in <module>
|
||||
main()
|
||||
File "/home/nguyendc/sonnh/embedding-clustering/cluster/gmm_extensive.py", line 640, in main
|
||||
clustering.save_clustering_results(results)
|
||||
File "/home/nguyendc/sonnh/embedding-clustering/cluster/gmm_extensive.py", line 614, in save_clustering_results
|
||||
json.dump({
|
||||
File "/home/nguyendc/miniconda3/envs/cluster/lib/python3.10/json/__init__.py", line 179, in dump
|
||||
for chunk in iterable:
|
||||
File "/home/nguyendc/miniconda3/envs/cluster/lib/python3.10/json/encoder.py", line 431, in _iterencode
|
||||
yield from _iterencode_dict(o, _current_indent_level)
|
||||
File "/home/nguyendc/miniconda3/envs/cluster/lib/python3.10/json/encoder.py", line 405, in _iterencode_dict
|
||||
yield from chunks
|
||||
File "/home/nguyendc/miniconda3/envs/cluster/lib/python3.10/json/encoder.py", line 438, in _iterencode
|
||||
o = _default(o)
|
||||
File "/home/nguyendc/miniconda3/envs/cluster/lib/python3.10/json/encoder.py", line 179, in default
|
||||
raise TypeError(f'Object of type {o.__class__.__name__} '
|
||||
TypeError: Object of type float32 is not JSON serializable
|
140
cluster/log_gmm_extensive_update.txt
Normal file
140
cluster/log_gmm_extensive_update.txt
Normal file
@@ -0,0 +1,140 @@
|
||||
nohup: ignoring input
|
||||
Loading embeddings from /home/nguyendc/sonnh/embedding-clustering/extract/embeddings_factures_osteopathie_1k_qwen.json...
|
||||
Loaded 2800 samples with embedding dimension 2048
|
||||
|
||||
======================================================================
|
||||
RUNNING GAUSSIAN MIXTURE MODEL CLUSTERING WITH OPTIMIZED GRID SEARCH
|
||||
======================================================================
|
||||
Optimized parameter combinations:
|
||||
- n_components: 21 values [2, 3, 4, 5, 6, 8, 10, 11, 14, 17, 20, 23, 26, 29, 32, 35, 38, 41, 44, 47, 50]
|
||||
- covariance_types: 2 options ['tied', 'spherical']
|
||||
- reg_covar: 3 values [1e-05, 0.0001, 0.001]
|
||||
- n_init: 2 values [1, 5]
|
||||
- init_params: 2 options ['kmeans', 'k-means++']
|
||||
- max_iter: 2 values [100, 300]
|
||||
Total combinations: 1008 (optimized for speed)
|
||||
Estimated runtime: 8.4 minutes
|
||||
This should be much faster...
|
||||
|
||||
n_components=2, cov=tied, init=kmeans: BIC=6521812.14, AIC=-5960170.38, silhouette=0.3692
|
||||
n_components=3, cov=tied, init=kmeans: BIC=6511443.85, AIC=-5982704.34, silhouette=0.3756
|
||||
n_components=3, cov=tied, init=kmeans: BIC=6511443.85, AIC=-5982704.34, silhouette=0.3756
|
||||
n_components=3, cov=tied, init=kmeans: BIC=6511443.85, AIC=-5982704.34, silhouette=0.3756
|
||||
n_components=3, cov=tied, init=kmeans: BIC=6511443.85, AIC=-5982704.34, silhouette=0.3756
|
||||
n_components=4, cov=tied, init=kmeans: BIC=6514783.32, AIC=-5991530.55, silhouette=0.3110
|
||||
Progress: 50/1008 (5.0%) - Best scores so far: BIC=6511443.85, Silhouette=0.376
|
||||
n_components=4, cov=tied, init=kmeans: BIC=6514783.32, AIC=-5991530.55, silhouette=0.3110
|
||||
n_components=4, cov=tied, init=kmeans: BIC=6514783.32, AIC=-5991530.55, silhouette=0.3110
|
||||
n_components=4, cov=tied, init=kmeans: BIC=6514783.32, AIC=-5991530.55, silhouette=0.3110
|
||||
n_components=5, cov=tied, init=kmeans: BIC=6520503.08, AIC=-5997976.48, silhouette=0.3163
|
||||
n_components=5, cov=tied, init=kmeans: BIC=6520503.08, AIC=-5997976.48, silhouette=0.3163
|
||||
n_components=5, cov=tied, init=kmeans: BIC=6520503.08, AIC=-5997976.48, silhouette=0.3163
|
||||
n_components=5, cov=tied, init=kmeans: BIC=6520503.08, AIC=-5997976.48, silhouette=0.3163
|
||||
Progress: 100/1008 (9.9%) - Best scores so far: BIC=6511443.85, Silhouette=0.376
|
||||
Progress: 150/1008 (14.9%) - Best scores so far: BIC=6511443.85, Silhouette=0.376
|
||||
Progress: 200/1008 (19.8%) - Best scores so far: BIC=6511443.85, Silhouette=0.376
|
||||
Progress: 250/1008 (24.8%) - Best scores so far: BIC=6511443.85, Silhouette=0.376
|
||||
Progress: 300/1008 (29.8%) - Best scores so far: BIC=6511443.85, Silhouette=0.376
|
||||
Progress: 350/1008 (34.7%) - Best scores so far: BIC=6511443.85, Silhouette=0.376
|
||||
Progress: 400/1008 (39.7%) - Best scores so far: BIC=6511443.85, Silhouette=0.376
|
||||
Progress: 450/1008 (44.6%) - Best scores so far: BIC=6511443.85, Silhouette=0.376
|
||||
Progress: 500/1008 (49.6%) - Best scores so far: BIC=6511443.85, Silhouette=0.376
|
||||
Progress: 550/1008 (54.6%) - Best scores so far: BIC=6511443.85, Silhouette=0.376
|
||||
Progress: 600/1008 (59.5%) - Best scores so far: BIC=6511443.85, Silhouette=0.376
|
||||
Progress: 650/1008 (64.5%) - Best scores so far: BIC=6511443.85, Silhouette=0.376
|
||||
Progress: 700/1008 (69.4%) - Best scores so far: BIC=6511443.85, Silhouette=0.376
|
||||
Progress: 750/1008 (74.4%) - Best scores so far: BIC=6511443.85, Silhouette=0.376
|
||||
Progress: 800/1008 (79.4%) - Best scores so far: BIC=6511443.85, Silhouette=0.376
|
||||
Progress: 850/1008 (84.3%) - Best scores so far: BIC=6511443.85, Silhouette=0.376
|
||||
Progress: 900/1008 (89.3%) - Best scores so far: BIC=6511443.85, Silhouette=0.376
|
||||
Progress: 950/1008 (94.2%) - Best scores so far: BIC=6511443.85, Silhouette=0.376
|
||||
Progress: 1000/1008 (99.2%) - Best scores so far: BIC=6511443.85, Silhouette=0.376
|
||||
Progress: 1008/1008 (100.0%) - Best scores so far: BIC=6511443.85, Silhouette=0.376
|
||||
|
||||
======================================================================
|
||||
GAUSSIAN MIXTURE MODEL GRID SEARCH ANALYSIS
|
||||
======================================================================
|
||||
Total parameter combinations tested: 1008
|
||||
Combinations with valid clustering: 1008
|
||||
|
||||
Model Selection Metrics:
|
||||
Best BIC score: 6511443.85
|
||||
Best AIC score: -6295231.48
|
||||
Best Log-Likelihood: 1910.09
|
||||
|
||||
Clustering Quality Metrics:
|
||||
Best silhouette score: 0.3757
|
||||
Mean silhouette score: 0.0287
|
||||
Best Calinski-Harabasz score: 1331.69
|
||||
Best Davies-Bouldin score: 0.6762
|
||||
|
||||
Top 5 results by BIC (lower is better):
|
||||
n_comp=3, cov=tied: BIC=6511443.85, AIC=-5982704.34
|
||||
n_comp=3, cov=tied: BIC=6511443.85, AIC=-5982704.34
|
||||
n_comp=3, cov=tied: BIC=6511443.85, AIC=-5982704.34
|
||||
n_comp=3, cov=tied: BIC=6511443.85, AIC=-5982704.34
|
||||
n_comp=4, cov=tied: BIC=6514783.32, AIC=-5991530.55
|
||||
|
||||
Top 5 results by AIC (lower is better):
|
||||
n_comp=50, cov=tied: BIC=6770703.71, AIC=-6295231.48
|
||||
n_comp=50, cov=tied: BIC=6770703.71, AIC=-6295231.48
|
||||
n_comp=50, cov=tied: BIC=6779928.76, AIC=-6286006.43
|
||||
n_comp=50, cov=tied: BIC=6779928.76, AIC=-6286006.43
|
||||
n_comp=47, cov=tied: BIC=6755535.12, AIC=-6273903.03
|
||||
|
||||
Top 5 results by Silhouette Score:
|
||||
n_comp=3, cov=spherical: silhouette=0.3757
|
||||
n_comp=3, cov=spherical: silhouette=0.3757
|
||||
n_comp=3, cov=spherical: silhouette=0.3757
|
||||
n_comp=3, cov=spherical: silhouette=0.3757
|
||||
n_comp=3, cov=spherical: silhouette=0.3757
|
||||
|
||||
Component count analysis (top 10 by BIC):
|
||||
3.0 components: BIC=6511443.85, AIC=-5982704.34, silhouette=0.3757
|
||||
4.0 components: BIC=6514783.32, AIC=-5991530.55, silhouette=0.3110
|
||||
5.0 components: BIC=6520503.08, AIC=-5997976.48, silhouette=0.3163
|
||||
2.0 components: BIC=6521812.14, AIC=-5960170.38, silhouette=0.3693
|
||||
6.0 components: BIC=6526215.27, AIC=-6004429.97, silhouette=0.2485
|
||||
8.0 components: BIC=6529704.08, AIC=-6025272.52, silhouette=0.2680
|
||||
10.0 components: BIC=6538644.29, AIC=-6040663.67, silhouette=0.2706
|
||||
11.0 components: BIC=6546208.81, AIC=-6045264.84, silhouette=0.2580
|
||||
14.0 components: BIC=6563001.35, AIC=-6064969.34, silhouette=0.2241
|
||||
17.0 components: BIC=6580862.17, AIC=-6083605.55, silhouette=0.2109
|
||||
|
||||
📁 SAVING DETAILED RESULTS...
|
||||
==============================
|
||||
Detailed grid search results saved to: gmm_grid_search_detailed_20250805_150635.json
|
||||
Grid search summary CSV saved to: gmm_grid_search_summary_20250805_150635.csv
|
||||
|
||||
Best GMM result by BIC:
|
||||
Parameters: {'n_components': 3, 'covariance_type': 'tied', 'reg_covar': 1e-05, 'n_init': 1, 'init_params': 'kmeans', 'max_iter': 100}
|
||||
BIC score: 6511443.85
|
||||
|
||||
Best GMM result by AIC:
|
||||
Parameters: {'n_components': 50, 'covariance_type': 'tied', 'reg_covar': 1e-05, 'n_init': 5, 'init_params': 'kmeans', 'max_iter': 100}
|
||||
AIC score: -6295231.48
|
||||
|
||||
Best GMM result by Silhouette:
|
||||
Parameters: {'n_components': 3, 'covariance_type': 'spherical', 'reg_covar': 1e-05, 'n_init': 1, 'init_params': 'kmeans', 'max_iter': 100}
|
||||
Silhouette score: 0.3757
|
||||
Visualization saved as 'gmm_clustering_results.png'
|
||||
Final clustering results (bic) saved to: gmm_final_results_bic_20250805_150636.json
|
||||
Final clustering results (aic) saved to: gmm_final_results_aic_20250805_150636.json
|
||||
Traceback (most recent call last):
|
||||
File "/home/nguyendc/sonnh/embedding-clustering/cluster/gmm_extensive.py", line 649, in <module>
|
||||
main()
|
||||
File "/home/nguyendc/sonnh/embedding-clustering/cluster/gmm_extensive.py", line 643, in main
|
||||
clustering.save_clustering_results(results)
|
||||
File "/home/nguyendc/sonnh/embedding-clustering/cluster/gmm_extensive.py", line 617, in save_clustering_results
|
||||
json.dump({
|
||||
File "/usr/lib/python3.10/json/__init__.py", line 179, in dump
|
||||
for chunk in iterable:
|
||||
File "/usr/lib/python3.10/json/encoder.py", line 431, in _iterencode
|
||||
yield from _iterencode_dict(o, _current_indent_level)
|
||||
File "/usr/lib/python3.10/json/encoder.py", line 405, in _iterencode_dict
|
||||
yield from chunks
|
||||
File "/usr/lib/python3.10/json/encoder.py", line 438, in _iterencode
|
||||
o = _default(o)
|
||||
File "/usr/lib/python3.10/json/encoder.py", line 179, in default
|
||||
raise TypeError(f'Object of type {o.__class__.__name__} '
|
||||
TypeError: Object of type float32 is not JSON serializable
|
Reference in New Issue
Block a user