update source code and pipeline
This commit is contained in:
670
cluster/auto_cluster copy.py
Normal file
670
cluster/auto_cluster copy.py
Normal file
@@ -0,0 +1,670 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Simple script to run automatic clustering methods (DBSCAN, Mean Shift, Affinity Propagation)
|
||||
These methods don't require specifying the number of clusters beforehand.
|
||||
"""
|
||||
|
||||
import json
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
from sklearn.cluster import DBSCAN, MeanShift, AffinityPropagation
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from sklearn.metrics import silhouette_score
|
||||
from sklearn.neighbors import NearestNeighbors
|
||||
from sklearn.decomposition import PCA
|
||||
import argparse
|
||||
import warnings
|
||||
# Suppress every warning for the whole process. NOTE(review): this also hides
# warnings raised by the clustering libraries - confirm that is intended.
warnings.filterwarnings('ignore')
|
||||
|
||||
class AutoClustering:
    """Cluster document embeddings with algorithms that pick the cluster count themselves.

    The embeddings JSON file is read and standardized on construction. DBSCAN,
    Mean Shift and Affinity Propagation can then be run one at a time or through
    run_all_methods(). Each run_* method returns an array of cluster labels, or
    None when no acceptable clustering was found.
    """

    def __init__(self, embeddings_path):
        """Store the path, then load and standardize the embeddings.

        Args:
            embeddings_path: Path to a JSON file holding a list of records, each
                with a 'filepath' and an 'embedding' entry.
        """
        self.embeddings_path = embeddings_path
        self.embeddings = None
        self.file_paths = None
        self.scaler = None
        self.embeddings_scaled = None
        self.load_embeddings()

    def load_embeddings(self):
        """Load file paths and embeddings from the JSON file and standardize them.

        Populates file_paths, embeddings (float32), scaler and embeddings_scaled.

        Raises:
            ValueError: If the file holds no records or the embeddings do not
                form a 2-D array.
        """
        print(f"Loading embeddings from {self.embeddings_path}...")
        with open(self.embeddings_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        self.file_paths = [item['filepath'] for item in data]
        self.embeddings = np.array([item['embedding'] for item in data], dtype=np.float32)

        # Fail early with a clear message instead of an IndexError on shape[1].
        if self.embeddings.ndim != 2 or self.embeddings.shape[0] == 0:
            raise ValueError(
                f"Expected a non-empty list of equal-length embeddings in {self.embeddings_path}"
            )
        print(f"Loaded {len(self.file_paths)} samples with embedding dimension {self.embeddings.shape[1]}")

        # Standardize embeddings for better clustering
        self.scaler = StandardScaler()
        self.embeddings_scaled = self.scaler.fit_transform(self.embeddings)

    @staticmethod
    def _as_float(value):
        """Return value as a built-in float (None stays None) so json can serialize it."""
        return None if value is None else float(value)

    def _dbscan_eps_candidates(self):
        """Return the sorted, de-duplicated positive eps values for the DBSCAN grid search."""
        from scipy.spatial.distance import pdist

        data = self.embeddings_scaled
        n_samples = len(data)
        candidates = []

        # Method 1: percentiles of the k-th nearest-neighbour distance for several k.
        for k in (5, 10, 15, 20, 25, 30):
            k_actual = min(k, n_samples // 4)
            if k_actual < 3:
                continue
            distances, _ = NearestNeighbors(n_neighbors=k_actual).fit(data).kneighbors(data)
            # kneighbors returns each row in ascending order, so the last column
            # is the k-th neighbour distance.
            kth_distances = distances[:, k_actual - 1]
            candidates.extend(
                np.percentile(kth_distances, p) for p in (60, 65, 70, 75, 80, 85, 90, 95)
            )

        # Method 2: scaled mean / median / std of pairwise distances (sampled for efficiency).
        sample_size = min(1000, n_samples)
        sample_indices = np.random.choice(n_samples, sample_size, replace=False)
        pairwise_distances = pdist(data[sample_indices])
        if pairwise_distances.size:
            mean_dist = np.mean(pairwise_distances)
            median_dist = np.median(pairwise_distances)
            std_dist = np.std(pairwise_distances)
            candidates.extend(mean_dist * factor for factor in (0.3, 0.4, 0.5, 0.6, 0.7))
            candidates.extend(median_dist * factor for factor in (0.3, 0.4, 0.5, 0.6))
            candidates.extend(std_dist * factor for factor in (0.5, 0.8, 1.0, 1.2))

        # Method 3: manual eps values for different scales.
        candidates.extend([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0,
                           1.2, 1.5, 1.8, 2.0, 2.5, 3.0, 3.5, 4.0, 5.0])

        # Built-in floats keep the values JSON-serializable; NaN fails the > 0 test.
        return sorted({float(eps) for eps in candidates if eps > 0})

    @staticmethod
    def _print_dbscan_analysis(all_results):
        """Print summary statistics for the recorded DBSCAN grid-search trials."""
        from collections import Counter

        print(f"Total parameter combinations tested: {len(all_results)}")

        valid_results = [r for r in all_results if r['n_clusters'] >= 2]
        print(f"Combinations that produced clusters: {len(valid_results)}")
        if not valid_results:
            return

        scored_results = [r for r in valid_results if r['silhouette_score'] is not None]
        if scored_results:
            scores = [r['silhouette_score'] for r in scored_results]
            print(f"Combinations with valid silhouette scores: {len(scored_results)}")
            print(f"Best silhouette score: {max(scores):.4f}")
            print(f"Mean silhouette score: {sum(scores) / len(scores):.4f}")

            top_results = sorted(
                scored_results, key=lambda r: r['silhouette_score'], reverse=True
            )[:5]
            print("\nTop 5 parameter combinations:")
            for row in top_results:
                print(f" eps={row['eps']:.4f}, min_samples={row['min_samples']}: "
                      f"{row['n_clusters']} clusters, silhouette={row['silhouette_score']:.4f}")

        cluster_counts = Counter(r['n_clusters'] for r in valid_results)
        print(f"\nCluster count distribution:")
        for n_clusters, count in sorted(cluster_counts.items()):
            print(f" {n_clusters} clusters: {count} parameter combinations")

    def run_dbscan(self, min_samples_candidates=None):
        """Grid-search DBSCAN parameters and return the best labelling.

        Every (eps, min_samples) pair is fitted. Pairs that yield at least two
        clusters and less than 90% noise are scored with the silhouette
        coefficient computed on the non-noise points. All trials are written to
        timestamped JSON/CSV files in the current directory.

        Args:
            min_samples_candidates: Iterable of min_samples values to try.
                Defaults to [60], the single value the search was pinned to.

        Returns:
            The label array of the best-scoring trial (-1 marks noise), or None
            if no trial produced a scorable clustering.
        """
        print("\n" + "="*50)
        print("RUNNING DBSCAN CLUSTERING WITH EXTENSIVE GRID SEARCH")
        print("="*50)

        data = self.embeddings_scaled
        eps_candidates = self._dbscan_eps_candidates()
        if min_samples_candidates is None:
            min_samples_candidates = [60]
        else:
            min_samples_candidates = list(min_samples_candidates)

        best_score = -1
        best_params = None
        best_labels = None

        total_combinations = len(eps_candidates) * len(min_samples_candidates)
        print(f"Testing {len(eps_candidates)} eps values x {len(min_samples_candidates)} min_samples values")
        print(f"Total combinations: {total_combinations}")
        print("This may take a while...\n")

        # Track all results for analysis
        all_results = []
        current_combination = 0

        for eps in eps_candidates:
            for min_samples in min_samples_candidates:
                current_combination += 1

                # Progress indicator
                if current_combination % 50 == 0 or current_combination == total_combinations:
                    progress = (current_combination / total_combinations) * 100
                    print(f"Progress: {current_combination}/{total_combinations} ({progress:.1f}%)")

                try:
                    labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(data)
                except Exception as exc:
                    # Best effort: report the problematic combination and move on.
                    print(f"eps={eps:.4f}, min_samples={min_samples}: failed ({str(exc)[:50]}...)")
                    continue

                n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
                n_noise = int(np.sum(labels == -1))
                noise_ratio = n_noise / len(labels)

                # 'silhouette_score' is always present so the save methods can
                # rely on the key; None marks an unscored trial.
                result_info = {
                    'eps': eps,
                    'min_samples': min_samples,
                    'n_clusters': n_clusters,
                    'n_noise': n_noise,
                    'noise_ratio': noise_ratio,
                    'silhouette_score': None,
                }

                # Only score meaningful clusterings (not too many noise points).
                if n_clusters >= 2 and noise_ratio < 0.9:
                    mask = labels != -1  # silhouette is computed without noise points
                    score = None
                    if np.sum(mask) > 1:
                        try:
                            score = float(silhouette_score(data[mask], labels[mask]))
                        except Exception:
                            # Scoring can fail on degenerate label sets; the
                            # trial is then recorded as unscored.
                            score = None

                    if score is not None:
                        result_info['silhouette_score'] = score

                        # Print promising results
                        if score > 0.1:
                            print(f"eps={eps:.4f}, min_samples={min_samples}: {n_clusters} clusters, {n_noise} noise ({noise_ratio:.1%}), silhouette={score:.4f}")

                        if score > best_score:
                            best_score = score
                            best_params = (eps, min_samples)
                            best_labels = labels

                all_results.append(result_info)

        # Analysis of results
        print("\n" + "="*50)
        print("DBSCAN GRID SEARCH ANALYSIS")
        print("="*50)

        if all_results:
            self._print_dbscan_analysis(all_results)

            print(f"\n📁 SAVING DETAILED RESULTS...")
            print("="*30)

            # Save detailed grid search results to JSON file
            self.save_dbscan_grid_search_results(all_results, best_params, best_score)

        if best_labels is None:
            print("DBSCAN could not find suitable clusters with the extensive grid search")
            print("Consider:")
            print("- Adjusting the embedding space (different model or preprocessing)")
            print("- Using different clustering algorithms")
            print("- Manual parameter tuning based on domain knowledge")
            return None

        n_clusters = len(set(best_labels)) - (1 if -1 in best_labels else 0)
        n_noise = list(best_labels).count(-1)

        print(f"\nBest DBSCAN result:")
        print(f"Parameters: eps={best_params[0]:.4f}, min_samples={best_params[1]}")
        print(f"Number of clusters: {n_clusters}")
        print(f"Number of noise points: {n_noise} ({n_noise/len(best_labels)*100:.1f}%)")
        print(f"Silhouette score: {best_score:.4f}")

        return best_labels

    def save_dbscan_grid_search_results(self, all_results, best_params, best_score):
        """Write every grid-search trial to a timestamped JSON file plus a CSV summary.

        Args:
            all_results: List of trial dicts with 'eps', 'min_samples',
                'n_clusters', 'n_noise', 'noise_ratio' and optionally
                'silhouette_score' (missing or None means the trial was unscored).
            best_params: (eps, min_samples) of the best trial, or None.
            best_score: Best silhouette score; -1 means no trial was scored.
        """
        import datetime
        import statistics

        now = datetime.datetime.now()

        # NumPy scalars (float32 in particular) are not JSON-serializable, so
        # every number is converted to a built-in type before it is stored.
        grid_search_data = {
            "experiment_info": {
                "timestamp": now.isoformat(),
                "dataset_path": self.embeddings_path,
                "total_samples": len(self.file_paths),
                "embedding_dimension": int(self.embeddings.shape[1]),
                "total_combinations_tested": len(all_results)
            },
            "best_result": {
                "eps": self._as_float(best_params[0]) if best_params else None,
                "min_samples": int(best_params[1]) if best_params else None,
                "silhouette_score": float(best_score) if best_score > -1 else None
            },
            "all_trials": []
        }

        # Add all trial results
        for trial_id, result in enumerate(all_results, start=1):
            score = self._as_float(result.get('silhouette_score'))
            grid_search_data["all_trials"].append({
                "trial_id": trial_id,
                "parameters": {
                    "eps": float(result['eps']),
                    "min_samples": int(result['min_samples'])
                },
                "results": {
                    "n_clusters": int(result['n_clusters']),
                    "n_noise": int(result['n_noise']),
                    "noise_ratio": float(result['noise_ratio']),
                    "silhouette_score": score
                },
                "status": "success" if score is not None else "failed"
            })

        # Summary statistics over the scored ("success") trials only.
        valid_trials = [t for t in grid_search_data["all_trials"] if t["status"] == "success"]
        if valid_trials:
            silhouette_scores = [t["results"]["silhouette_score"] for t in valid_trials]
            grid_search_data["summary_statistics"] = {
                "total_trials": len(all_results),
                "successful_trials": len(valid_trials),
                "success_rate": len(valid_trials) / len(all_results),
                "best_silhouette_score": max(silhouette_scores),
                "worst_silhouette_score": min(silhouette_scores),
                "mean_silhouette_score": sum(silhouette_scores) / len(silhouette_scores),
                # statistics.median averages the two middle values for even counts.
                "median_silhouette_score": statistics.median(silhouette_scores)
            }

            # Top 10 results
            grid_search_data["top_10_results"] = sorted(
                valid_trials,
                key=lambda t: t["results"]["silhouette_score"],
                reverse=True,
            )[:10]

            # Parameter analysis
            eps_values = [t["parameters"]["eps"] for t in valid_trials]
            min_samples_values = [t["parameters"]["min_samples"] for t in valid_trials]
            grid_search_data["parameter_analysis"] = {
                "eps_range": {
                    "min": min(eps_values),
                    "max": max(eps_values),
                    "mean": sum(eps_values) / len(eps_values)
                },
                "min_samples_range": {
                    "min": min(min_samples_values),
                    "max": max(min_samples_values),
                    "mean": sum(min_samples_values) / len(min_samples_values)
                }
            }

        # Save to file with timestamp
        timestamp = now.strftime("%Y%m%d_%H%M%S")
        filename = f"dbscan_grid_search_detailed_{timestamp}.json"

        # ensure_ascii=False may emit non-ASCII characters, so fix the encoding.
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(grid_search_data, f, indent=4, ensure_ascii=False)

        print(f"Detailed grid search results saved to: {filename}")

        # Also save a CSV summary for easy analysis
        csv_filename = f"dbscan_grid_search_summary_{timestamp}.csv"
        self.save_grid_search_csv(all_results, csv_filename)
        print(f"Grid search summary CSV saved to: {csv_filename}")

    def save_grid_search_csv(self, all_results, filename):
        """Write one CSV row per grid-search trial to filename.

        Args:
            all_results: List of trial dicts as produced by run_dbscan.
            filename: Destination path of the CSV file.
        """
        import csv

        fieldnames = ['trial_id', 'eps', 'min_samples', 'n_clusters', 'n_noise',
                      'noise_ratio', 'silhouette_score', 'status']

        with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            for trial_id, result in enumerate(all_results, start=1):
                score = result.get('silhouette_score')
                writer.writerow({
                    'trial_id': trial_id,
                    'eps': result['eps'],
                    'min_samples': result['min_samples'],
                    'n_clusters': result['n_clusters'],
                    'n_noise': result['n_noise'],
                    'noise_ratio': result['noise_ratio'],
                    'silhouette_score': score,
                    'status': 'success' if score is not None else 'failed'
                })

    def run_mean_shift(self):
        """Run Mean Shift over several candidate bandwidths and return the best labelling.

        Returns:
            The label array with the highest silhouette score among runs that
            produced between 2 and n_samples // 3 clusters, or None.
        """
        print("\n" + "="*50)
        print("RUNNING MEAN SHIFT CLUSTERING")
        print("="*50)

        from sklearn.cluster import estimate_bandwidth

        data = self.embeddings_scaled
        n_samples = len(data)
        bandwidth_candidates = []

        # Method 1: sklearn's estimate_bandwidth
        try:
            bw_est = estimate_bandwidth(data, quantile=0.3, n_samples=min(500, n_samples))
        except Exception as exc:
            # Best effort: the nearest-neighbour estimate below still applies.
            print(f"Bandwidth estimation failed ({str(exc)[:50]}...)")
        else:
            if bw_est > 0:
                bandwidth_candidates.extend([bw_est * 0.5, bw_est, bw_est * 1.5])

        # Method 2: nearest neighbor distances (capped so small datasets do not fail)
        n_neighbors = min(10, n_samples)
        if n_neighbors > 1:
            distances, _ = NearestNeighbors(n_neighbors=n_neighbors).fit(data).kneighbors(data)
            mean_dist = np.mean(distances[:, 1:])  # Exclude self-distance
            bandwidth_candidates.extend([mean_dist * 0.5, mean_dist, mean_dist * 1.5])

        # Remove duplicates and invalid values
        bandwidth_candidates = sorted({float(bw) for bw in bandwidth_candidates if bw > 0})

        if not bandwidth_candidates:
            bandwidth_candidates = [0.5, 1.0, 1.5, 2.0]

        best_score = -1
        best_bandwidth = None
        best_labels = None

        print("Testing different bandwidth values...")
        for bandwidth in bandwidth_candidates:
            try:
                labels = MeanShift(bandwidth=bandwidth).fit_predict(data)

                n_clusters = len(set(labels))
                if 2 <= n_clusters <= n_samples // 3:
                    score = silhouette_score(data, labels)
                    print(f"bandwidth={bandwidth:.4f}: {n_clusters} clusters, silhouette={score:.4f}")

                    if score > best_score:
                        best_score = score
                        best_bandwidth = bandwidth
                        best_labels = labels
            except Exception as e:
                print(f"bandwidth={bandwidth:.4f}: failed ({str(e)[:50]}...)")
                continue

        if best_labels is None:
            print("Mean Shift could not find suitable clusters")
            return None

        n_clusters = len(set(best_labels))
        print(f"\nBest Mean Shift result:")
        print(f"Bandwidth: {best_bandwidth:.4f}")
        print(f"Number of clusters: {n_clusters}")
        print(f"Silhouette score: {best_score:.4f}")

        return best_labels

    def run_affinity_propagation(self):
        """Run Affinity Propagation over a preference/damping grid and return the best labelling.

        Returns:
            The label array with the highest silhouette score among runs that
            produced between 2 and n_samples // 3 clusters, or None.
        """
        print("\n" + "="*50)
        print("RUNNING AFFINITY PROPAGATION CLUSTERING")
        print("="*50)

        from scipy.spatial.distance import pdist

        data = self.embeddings_scaled

        # Similarity = negative squared Euclidean distance. pdist yields only the
        # off-diagonal pairs, which avoids an N x N x D temporary and keeps the
        # all-zero self-similarities out of the percentiles.
        similarities = -pdist(data, metric='sqeuclidean')
        if similarities.size == 0:
            print("Affinity Propagation could not find suitable clusters")
            return None

        # Preference candidates: percentiles of the pairwise similarities
        # (the 50th percentile is the median).
        preference_candidates = sorted(
            {float(np.percentile(similarities, q)) for q in (10, 25, 50, 75)}
        )
        damping_candidates = [0.5, 0.7, 0.8, 0.9]

        best_score = -1
        best_params = None
        best_labels = None

        print("Testing different parameter combinations...")
        for preference in preference_candidates:
            for damping in damping_candidates:
                try:
                    affinity_prop = AffinityPropagation(
                        preference=preference,
                        damping=damping,
                        random_state=42,
                        max_iter=200
                    )
                    labels = affinity_prop.fit_predict(data)

                    n_clusters = len(set(labels))
                    if 2 <= n_clusters <= len(data) // 3:
                        score = silhouette_score(data, labels)
                        print(f"preference={preference:.2f}, damping={damping:.1f}: {n_clusters} clusters, silhouette={score:.4f}")

                        if score > best_score:
                            best_score = score
                            best_params = (preference, damping)
                            best_labels = labels
                except Exception as e:
                    print(f"preference={preference:.2f}, damping={damping:.1f}: failed ({str(e)[:30]}...)")
                    continue

        if best_labels is None:
            print("Affinity Propagation could not find suitable clusters")
            return None

        n_clusters = len(set(best_labels))
        print(f"\nBest Affinity Propagation result:")
        print(f"Parameters: preference={best_params[0]:.2f}, damping={best_params[1]:.1f}")
        print(f"Number of clusters: {n_clusters}")
        print(f"Silhouette score: {best_score:.4f}")

        return best_labels

    def visualize_results(self, results_dict):
        """Plot each method's clusters on a 2-D PCA projection and save the figure.

        The figure is written to 'auto_clustering_results.png' in the current
        directory and then shown.

        Args:
            results_dict: Mapping of method name to label sequence; label -1 is
                drawn as noise.
        """
        if not results_dict:
            print("No results to visualize")
            return

        # Reduce dimensions for visualization
        pca = PCA(n_components=2, random_state=42)
        embeddings_2d = pca.fit_transform(self.embeddings_scaled)

        n_methods = len(results_dict)
        # squeeze=False always yields a 2-D axes array, even for one method.
        fig, axes = plt.subplots(1, n_methods, figsize=(5*n_methods, 4), squeeze=False)

        for ax, (method_name, labels) in zip(axes[0], results_dict.items()):
            labels = np.asarray(labels)  # element-wise masks need an array
            unique_labels = sorted(set(labels.tolist()))
            colors = plt.cm.Set1(np.linspace(0, 1, len(unique_labels)))

            for label, color in zip(unique_labels, colors):
                mask = labels == label
                if label == -1:
                    # Noise points in black
                    ax.scatter(embeddings_2d[mask, 0], embeddings_2d[mask, 1],
                               c='black', marker='x', s=20, alpha=0.5, label='Noise')
                else:
                    ax.scatter(embeddings_2d[mask, 0], embeddings_2d[mask, 1],
                               c=[color], s=50, alpha=0.7, label=f'Cluster {label}')

            n_clusters = len(unique_labels) - (1 if -1 in unique_labels else 0)
            ax.set_title(f'{method_name}\n({n_clusters} clusters)')
            ax.set_xlabel('PCA Component 1')
            ax.set_ylabel('PCA Component 2')
            ax.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.savefig('auto_clustering_results.png', dpi=300, bbox_inches='tight')
        plt.show()
        plt.close(fig)  # release the figure once it has been saved and shown

        print(f"\nVisualization saved as 'auto_clustering_results.png'")

    def save_results(self, results_dict):
        """Write each method's per-file cluster assignment to '<method>_results.json'.

        The file name is the lower-cased method name with spaces replaced by
        underscores. Only DBSCAN's -1 label is reported as noise.

        Args:
            results_dict: Mapping of method name to a label sequence aligned
                with self.file_paths.
        """
        for method_name, labels in results_dict.items():
            marks_noise = method_name == 'DBSCAN'
            method_results = [
                {
                    "filepath": filepath,
                    "cluster": int(label),
                    "is_noise": bool(marks_noise and label == -1)
                }
                for filepath, label in zip(self.file_paths, labels)
            ]

            # Save to file
            filename = f"{method_name.lower().replace(' ', '_')}_results.json"
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump({
                    "method": method_name,
                    "n_clusters": len(set(labels)) - (1 if -1 in labels else 0),
                    "n_samples": len(labels),
                    "results": method_results
                }, f, indent=4)

            print(f"Results saved to {filename}")

    def run_all_methods(self, methods=("DBSCAN",)):
        """Run the selected clustering methods, then summarize, plot and save the results.

        Args:
            methods: Names of the methods to run, any of "DBSCAN", "Mean Shift"
                and "Affinity Propagation". Only DBSCAN runs by default; the
                other two are opt-in.

        Returns:
            Dict mapping each method that found clusters to its label array.

        Raises:
            ValueError: If methods contains an unknown name.
        """
        runner_names = {
            "DBSCAN": "run_dbscan",
            "Mean Shift": "run_mean_shift",
            "Affinity Propagation": "run_affinity_propagation",
        }
        unknown = [name for name in methods if name not in runner_names]
        if unknown:
            raise ValueError(f"Unknown clustering method(s): {unknown}")

        print("\n" + "="*70)
        print("AUTOMATIC CLUSTERING ANALYSIS")
        print("="*70)
        print(f"Dataset: {len(self.file_paths)} documents")
        print(f"Embedding dimension: {self.embeddings.shape[1]}")

        results = {}
        for name in methods:
            labels = getattr(self, runner_names[name])()
            if labels is not None:
                results[name] = labels

        if not results:
            print("\nNo automatic clustering method found suitable clusters.")
            print("This might indicate:")
            print("- Data doesn't have clear cluster structure")
            print("- Embeddings need different preprocessing")
            print("- Different parameter ranges needed")
            return results

        # Summary
        print("\n" + "="*70)
        print("SUMMARY OF RESULTS")
        print("="*70)

        for method, labels in results.items():
            n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
            if method == "DBSCAN":
                n_noise = list(labels).count(-1)
                print(f"{method}: {n_clusters} clusters, {n_noise} noise points")
            else:
                print(f"{method}: {n_clusters} clusters")

        # Calculate agreement between methods if multiple succeeded
        if len(results) > 1:
            from sklearn.metrics import adjusted_rand_score
            print("\nMethod Agreement (Adjusted Rand Index):")
            method_names = list(results)
            for i, first in enumerate(method_names):
                for second in method_names[i + 1:]:
                    ari = adjusted_rand_score(results[first], results[second])
                    print(f"{first} vs {second}: {ari:.4f}")

        # Visualize and save results
        self.visualize_results(results)
        self.save_results(results)

        return results
|
||||
|
||||
def main():
    """Parse the command line and run the requested clustering method(s).

    With --method all, AutoClustering.run_all_methods() is called. Otherwise the
    chosen method is run on its own and, if it found clusters, its labels are
    visualized and saved.
    """
    parser = argparse.ArgumentParser(description="Run automatic clustering methods on document embeddings")
    # Required: without a path the loader would fail on open(None).
    parser.add_argument("--embeddings_path", required=True, help="Path to embeddings JSON file")
    parser.add_argument("--method", choices=['dbscan', 'meanshift', 'affinity', 'all'], default='all',
                        help="Which automatic method to run")

    args = parser.parse_args()

    # Initialize clustering
    clustering = AutoClustering(args.embeddings_path)

    # Run selected method(s)
    if args.method == 'all':
        clustering.run_all_methods()
        return

    # Map each CLI choice to its display name and the method that runs it.
    single_methods = {
        'dbscan': ("DBSCAN", clustering.run_dbscan),
        'meanshift': ("Mean Shift", clustering.run_mean_shift),
        'affinity': ("Affinity Propagation", clustering.run_affinity_propagation),
    }
    display_name, runner = single_methods[args.method]
    labels = runner()
    if labels is not None:
        results = {display_name: labels}
        clustering.visualize_results(results)
        clustering.save_results(results)
|
||||
# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()
|
Reference in New Issue
Block a user