671 lines
29 KiB
Python
671 lines
29 KiB
Python
#!/usr/bin/env python3
|
|
"""
Simple script to run automatic clustering methods (DBSCAN, Mean Shift, Affinity Propagation).

These methods don't require specifying the number of clusters beforehand.
"""
|
|
|
|
import json
|
|
import numpy as np
|
|
import matplotlib.pyplot as plt
|
|
from sklearn.cluster import DBSCAN, MeanShift, AffinityPropagation
|
|
from sklearn.preprocessing import StandardScaler
|
|
from sklearn.metrics import silhouette_score
|
|
from sklearn.neighbors import NearestNeighbors
|
|
from sklearn.decomposition import PCA
|
|
import argparse
|
|
import warnings
|
|
warnings.filterwarnings('ignore')
|
|
|
|
class AutoClustering:
    """Cluster embeddings with methods that do not need a cluster count.

    The embeddings are read from a JSON file (a list of objects carrying the
    keys ``filepath`` and ``embedding``), standardized with
    ``StandardScaler`` and then fed to DBSCAN, Mean Shift and/or Affinity
    Propagation. Candidate parameters are ranked by silhouette score.
    """

    # Default DBSCAN ``min_samples`` grid. A single value, matching what the
    # script effectively used before the grid became configurable.
    DEFAULT_MIN_SAMPLES = (60,)

    def __init__(self, embeddings_path):
        """Remember ``embeddings_path`` and load the embeddings immediately."""
        self.embeddings_path = embeddings_path
        self.embeddings = None
        self.embeddings_scaled = None
        self.file_paths = None
        self.scaler = None
        self.load_embeddings()

    def load_embeddings(self):
        """Load embeddings from the JSON file and standardize them.

        Populates ``file_paths``, ``embeddings`` (float32), ``scaler`` and
        ``embeddings_scaled``.

        Raises:
            ValueError: if the file holds no items, or the embeddings do not
                form a 2-D array.
        """
        print(f"Loading embeddings from {self.embeddings_path}...")
        with open(self.embeddings_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Fail early with a clear message instead of an IndexError on shape[1].
        if not data:
            raise ValueError(f"No embeddings found in {self.embeddings_path}")

        self.file_paths = [item['filepath'] for item in data]
        self.embeddings = np.array(
            [item['embedding'] for item in data], dtype=np.float32
        )
        if self.embeddings.ndim != 2:
            raise ValueError(
                f"Expected a 2-D embedding array, got shape {self.embeddings.shape}"
            )
        print(f"Loaded {len(self.file_paths)} samples with embedding dimension {self.embeddings.shape[1]}")

        # Standardize embeddings for better clustering
        self.scaler = StandardScaler()
        self.embeddings_scaled = self.scaler.fit_transform(self.embeddings)

    def _dbscan_eps_candidates(self, random_state=42):
        """Return a sorted list of distinct, positive DBSCAN ``eps`` candidates.

        Candidates come from three sources: percentiles of k-th neighbor
        distances, scaled mean/median/std of (sampled) pairwise distances,
        and a fixed list of manual values.
        """
        from scipy.spatial.distance import pdist

        data = self.embeddings_scaled
        n_samples = len(data)
        candidates = []

        # Method 1: k-nearest-neighbor distances for several k values.
        for k in (5, 10, 15, 20, 25, 30):
            k_actual = min(k, n_samples // 4)
            if k_actual < 3:
                continue
            knn = NearestNeighbors(n_neighbors=k_actual).fit(data)
            distances, _ = knn.kneighbors(data)
            # Percentiles do not depend on ordering, so no sort is needed.
            kth_distances = distances[:, k_actual - 1]
            candidates.extend(
                np.percentile(kth_distances, (60, 65, 70, 75, 80, 85, 90, 95))
            )

        # Method 2: statistics of pairwise distances, sampled for efficiency.
        # A seeded generator keeps the candidate grid reproducible.
        rng = np.random.default_rng(random_state)
        sample_size = min(1000, n_samples)
        sample_indices = rng.choice(n_samples, sample_size, replace=False)
        pairwise_distances = pdist(data[sample_indices])
        if pairwise_distances.size:
            mean_dist = np.mean(pairwise_distances)
            median_dist = np.median(pairwise_distances)
            std_dist = np.std(pairwise_distances)
            candidates.extend(mean_dist * f for f in (0.3, 0.4, 0.5, 0.6, 0.7))
            candidates.extend(median_dist * f for f in (0.3, 0.4, 0.5, 0.6))
            candidates.extend(std_dist * f for f in (0.5, 0.8, 1.0, 1.2))

        # Method 3: manual eps values for different scales.
        candidates.extend([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0,
                           1.2, 1.5, 1.8, 2.0, 2.5, 3.0, 3.5, 4.0, 5.0])

        # Plain floats keep the values JSON-serializable; the ``> 0`` filter
        # also drops NaN.
        return sorted({float(eps) for eps in candidates if eps > 0})

    @staticmethod
    def _report_dbscan_grid_search(all_results):
        """Print summary statistics for the collected DBSCAN trials."""
        print(f"Total parameter combinations tested: {len(all_results)}")

        # Valid results (with clusters)
        valid_results = [r for r in all_results if r['n_clusters'] >= 2]
        print(f"Combinations that produced clusters: {len(valid_results)}")
        if not valid_results:
            return

        scored_results = [
            r for r in valid_results if r['silhouette_score'] is not None
        ]
        if scored_results:
            scores = [r['silhouette_score'] for r in scored_results]
            print(f"Combinations with valid silhouette scores: {len(scored_results)}")
            print(f"Best silhouette score: {max(scores):.4f}")
            print(f"Mean silhouette score: {sum(scores) / len(scores):.4f}")

            # Top 5 results (stable sort: ties keep their trial order).
            top_results = sorted(
                scored_results, key=lambda r: r['silhouette_score'], reverse=True
            )[:5]
            print("\nTop 5 parameter combinations:")
            for row in top_results:
                print(f"  eps={row['eps']:.4f}, min_samples={row['min_samples']}: "
                      f"{row['n_clusters']} clusters, silhouette={row['silhouette_score']:.4f}")

        # Cluster count distribution
        from collections import Counter
        cluster_counts = Counter(r['n_clusters'] for r in valid_results)
        print("\nCluster count distribution:")
        for n_clusters in sorted(cluster_counts):
            print(f"  {n_clusters} clusters: {cluster_counts[n_clusters]} parameter combinations")

    def run_dbscan(self, min_samples_candidates=None, random_state=42):
        """Run DBSCAN over a grid of ``eps`` x ``min_samples`` values.

        Args:
            min_samples_candidates: iterable of ``min_samples`` values to try.
                ``None`` uses ``DEFAULT_MIN_SAMPLES``.
            random_state: seed for the pairwise-distance sample used when
                deriving ``eps`` candidates.

        Returns:
            The label array of the combination with the highest silhouette
            score (computed on non-noise points), or ``None`` when no
            combination yields at least two clusters with under 90% noise.

        Side effects:
            Prints progress and writes detailed JSON/CSV grid-search files
            via ``save_dbscan_grid_search_results``.
        """
        print("\n" + "=" * 50)
        print("RUNNING DBSCAN CLUSTERING WITH EXTENSIVE GRID SEARCH")
        print("=" * 50)

        data = self.embeddings_scaled
        eps_candidates = self._dbscan_eps_candidates(random_state)
        if min_samples_candidates is None:
            min_samples_candidates = self.DEFAULT_MIN_SAMPLES
        min_samples_candidates = [int(ms) for ms in min_samples_candidates]

        total_combinations = len(eps_candidates) * len(min_samples_candidates)
        print(f"Testing {len(eps_candidates)} eps values x {len(min_samples_candidates)} min_samples values")
        print(f"Total combinations: {total_combinations}")
        print("This may take a while...\n")

        best_score = -1
        best_params = None
        best_labels = None
        all_results = []  # one dict per trial, for later analysis
        current_combination = 0

        for eps in eps_candidates:
            for min_samples in min_samples_candidates:
                current_combination += 1

                # Progress indicator
                if current_combination % 50 == 0 or current_combination == total_combinations:
                    progress = (current_combination / total_combinations) * 100
                    print(f"Progress: {current_combination}/{total_combinations} ({progress:.1f}%)")

                try:
                    labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(data)
                except Exception as exc:
                    # Best effort: one bad combination must not abort the
                    # whole search, but it should not vanish silently either.
                    print(f"eps={eps:.4f}, min_samples={min_samples}: failed ({str(exc)[:50]}...)")
                    continue

                noise_mask = labels == -1
                n_noise = int(noise_mask.sum())
                n_clusters = len(set(labels.tolist())) - (1 if n_noise else 0)
                noise_ratio = n_noise / len(labels)

                # 'silhouette_score' is always present so the save/report
                # helpers never hit a missing key.
                result_info = {
                    'eps': eps,
                    'min_samples': min_samples,
                    'n_clusters': n_clusters,
                    'n_noise': n_noise,
                    'noise_ratio': noise_ratio,
                    'silhouette_score': None,
                }

                # Only score meaningful clusterings (not too many noise points).
                if n_clusters >= 2 and noise_ratio < 0.9:
                    clustered = ~noise_mask
                    try:
                        # float(): keep NumPy scalars out of the JSON output.
                        score = float(silhouette_score(data[clustered], labels[clustered]))
                    except Exception:
                        # Scoring is optional; the trial is recorded unscored.
                        score = None

                    if score is not None:
                        result_info['silhouette_score'] = score
                        if score > 0.1:  # Only show decent scores
                            print(f"eps={eps:.4f}, min_samples={min_samples}: {n_clusters} clusters, {n_noise} noise ({noise_ratio:.1%}), silhouette={score:.4f}")
                        if score > best_score:
                            best_score = score
                            best_params = (eps, min_samples)
                            best_labels = labels

                all_results.append(result_info)

        # Analysis of results
        print("\n" + "=" * 50)
        print("DBSCAN GRID SEARCH ANALYSIS")
        print("=" * 50)

        if all_results:
            self._report_dbscan_grid_search(all_results)

            print("\n📁 SAVING DETAILED RESULTS...")
            print("=" * 30)

            # Save detailed grid search results to JSON file
            self.save_dbscan_grid_search_results(all_results, best_params, best_score)

        if best_labels is None:
            print("DBSCAN could not find suitable clusters with the extensive grid search")
            print("Consider:")
            print("- Adjusting the embedding space (different model or preprocessing)")
            print("- Using different clustering algorithms")
            print("- Manual parameter tuning based on domain knowledge")
            return None

        n_noise = int(np.sum(best_labels == -1))
        n_clusters = len(set(best_labels.tolist())) - (1 if n_noise else 0)

        print("\nBest DBSCAN result:")
        print(f"Parameters: eps={best_params[0]:.4f}, min_samples={best_params[1]}")
        print(f"Number of clusters: {n_clusters}")
        print(f"Number of noise points: {n_noise} ({n_noise/len(best_labels)*100:.1f}%)")
        print(f"Silhouette score: {best_score:.4f}")

        return best_labels

    @staticmethod
    def _plain_trial(result):
        """Return one trial dict with plain Python types and a status field.

        A missing ``silhouette_score`` key is treated as ``None``; NumPy
        scalars are converted so the values can be written by ``json``.
        """
        score = result.get('silhouette_score')
        if score is not None:
            score = float(score)
        return {
            'eps': float(result['eps']),
            'min_samples': int(result['min_samples']),
            'n_clusters': int(result['n_clusters']),
            'n_noise': int(result['n_noise']),
            'noise_ratio': float(result['noise_ratio']),
            'silhouette_score': score,
            'status': 'success' if score is not None else 'failed',
        }

    def save_dbscan_grid_search_results(self, all_results, best_params, best_score):
        """Save detailed DBSCAN grid search results to JSON and CSV files.

        Args:
            all_results: list of trial dicts with keys ``eps``,
                ``min_samples``, ``n_clusters``, ``n_noise``, ``noise_ratio``
                and optionally ``silhouette_score``.
            best_params: ``(eps, min_samples)`` of the best trial, or ``None``.
            best_score: best silhouette score; values not above ``-1`` are
                stored as ``None``.

        Writes ``dbscan_grid_search_detailed_<timestamp>.json`` and
        ``dbscan_grid_search_summary_<timestamp>.csv`` in the current
        working directory.
        """
        import datetime

        # One clock reading so the JSON and CSV file names always match.
        now = datetime.datetime.now()

        has_best_score = best_score is not None and best_score > -1
        grid_search_data = {
            "experiment_info": {
                "timestamp": now.isoformat(),
                "dataset_path": self.embeddings_path,
                "total_samples": len(self.file_paths),
                "embedding_dimension": int(self.embeddings.shape[1]),
                "total_combinations_tested": len(all_results)
            },
            "best_result": {
                "eps": float(best_params[0]) if best_params else None,
                "min_samples": int(best_params[1]) if best_params else None,
                "silhouette_score": float(best_score) if has_best_score else None
            },
            "all_trials": []
        }

        # Add all trial results
        for trial_id, result in enumerate(all_results, start=1):
            plain = self._plain_trial(result)
            grid_search_data["all_trials"].append({
                "trial_id": trial_id,
                "parameters": {
                    "eps": plain['eps'],
                    "min_samples": plain['min_samples']
                },
                "results": {
                    "n_clusters": plain['n_clusters'],
                    "n_noise": plain['n_noise'],
                    "noise_ratio": plain['noise_ratio'],
                    "silhouette_score": plain['silhouette_score']
                },
                "status": plain['status']
            })

        # Summary statistics over the trials that received a silhouette score.
        valid_trials = [
            t for t in grid_search_data["all_trials"] if t["status"] == "success"
        ]
        if valid_trials:
            silhouette_scores = [t["results"]["silhouette_score"] for t in valid_trials]
            grid_search_data["summary_statistics"] = {
                "total_trials": len(all_results),
                "successful_trials": len(valid_trials),
                "success_rate": len(valid_trials) / len(all_results),
                "best_silhouette_score": max(silhouette_scores),
                "worst_silhouette_score": min(silhouette_scores),
                "mean_silhouette_score": sum(silhouette_scores) / len(silhouette_scores),
                # True median (averages the two middle values for even counts).
                "median_silhouette_score": float(np.median(silhouette_scores))
            }

            # Top 10 results
            grid_search_data["top_10_results"] = sorted(
                valid_trials,
                key=lambda t: t["results"]["silhouette_score"],
                reverse=True,
            )[:10]

            # Parameter analysis
            eps_values = [t["parameters"]["eps"] for t in valid_trials]
            min_samples_values = [t["parameters"]["min_samples"] for t in valid_trials]
            grid_search_data["parameter_analysis"] = {
                "eps_range": {
                    "min": min(eps_values),
                    "max": max(eps_values),
                    "mean": sum(eps_values) / len(eps_values)
                },
                "min_samples_range": {
                    "min": min(min_samples_values),
                    "max": max(min_samples_values),
                    "mean": sum(min_samples_values) / len(min_samples_values)
                }
            }

        # Save to file with timestamp
        timestamp = now.strftime("%Y%m%d_%H%M%S")
        filename = f"dbscan_grid_search_detailed_{timestamp}.json"

        # ensure_ascii=False can emit non-ASCII text, so pin the encoding.
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(grid_search_data, f, indent=4, ensure_ascii=False)

        print(f"Detailed grid search results saved to: {filename}")

        # Also save a CSV summary for easy analysis
        csv_filename = f"dbscan_grid_search_summary_{timestamp}.csv"
        self.save_grid_search_csv(all_results, csv_filename)
        print(f"Grid search summary CSV saved to: {csv_filename}")

    def save_grid_search_csv(self, all_results, filename):
        """Save grid search results as CSV for easy analysis.

        One row per trial; a missing or ``None`` silhouette score is written
        as an empty cell with status ``failed``.
        """
        import csv

        fieldnames = ['trial_id', 'eps', 'min_samples', 'n_clusters', 'n_noise',
                      'noise_ratio', 'silhouette_score', 'status']

        with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            for trial_id, result in enumerate(all_results, start=1):
                row = {'trial_id': trial_id}
                row.update(self._plain_trial(result))
                writer.writerow(row)

    def run_mean_shift(self):
        """Run Mean Shift clustering over several bandwidth candidates.

        Returns:
            The label array with the highest silhouette score among runs that
            produce between 2 and ``n_samples // 3`` clusters, or ``None``.
        """
        print("\n" + "=" * 50)
        print("RUNNING MEAN SHIFT CLUSTERING")
        print("=" * 50)

        from sklearn.cluster import estimate_bandwidth

        data = self.embeddings_scaled
        n_samples = len(data)
        bandwidth_candidates = []

        # Method 1: sklearn's estimate_bandwidth
        try:
            bw_est = estimate_bandwidth(data, quantile=0.3, n_samples=min(500, n_samples))
        except Exception as exc:
            # Best effort: the neighbor-based candidates below still apply.
            print(f"Bandwidth estimation failed ({str(exc)[:50]}...)")
        else:
            if bw_est > 0:
                bandwidth_candidates.extend([bw_est * 0.5, bw_est, bw_est * 1.5])

        # Method 2: nearest neighbor distances. n_neighbors is capped so
        # small datasets do not make NearestNeighbors raise.
        n_neighbors = min(10, n_samples)
        if n_neighbors > 1:
            knn = NearestNeighbors(n_neighbors=n_neighbors).fit(data)
            distances, _ = knn.kneighbors(data)
            mean_dist = np.mean(distances[:, 1:])  # Exclude self-distance
            bandwidth_candidates.extend([mean_dist * 0.5, mean_dist, mean_dist * 1.5])

        # Remove duplicates and invalid values; sort for a deterministic order.
        bandwidth_candidates = sorted(
            {float(bw) for bw in bandwidth_candidates if bw > 0}
        )
        if not bandwidth_candidates:
            bandwidth_candidates = [0.5, 1.0, 1.5, 2.0]

        best_score = -1
        best_bandwidth = None
        best_labels = None

        print("Testing different bandwidth values...")
        for bandwidth in bandwidth_candidates:
            try:
                labels = MeanShift(bandwidth=bandwidth).fit_predict(data)

                n_clusters = len(set(labels.tolist()))
                if 2 <= n_clusters <= n_samples // 3:
                    score = silhouette_score(data, labels)
                    print(f"bandwidth={bandwidth:.4f}: {n_clusters} clusters, silhouette={score:.4f}")

                    if score > best_score:
                        best_score = score
                        best_bandwidth = bandwidth
                        best_labels = labels
            except Exception as e:
                print(f"bandwidth={bandwidth:.4f}: failed ({str(e)[:50]}...)")
                continue

        if best_labels is None:
            print("Mean Shift could not find suitable clusters")
            return None

        print("\nBest Mean Shift result:")
        print(f"Bandwidth: {best_bandwidth:.4f}")
        print(f"Number of clusters: {len(set(best_labels.tolist()))}")
        print(f"Silhouette score: {best_score:.4f}")
        return best_labels

    def run_affinity_propagation(self):
        """Run Affinity Propagation over preference/damping candidates.

        Preference candidates are percentiles of the pairwise similarities
        (negative squared Euclidean distances between distinct samples).

        Returns:
            The label array with the highest silhouette score among runs that
            produce between 2 and ``n_samples // 3`` clusters, or ``None``.
        """
        print("\n" + "=" * 50)
        print("RUNNING AFFINITY PROPAGATION CLUSTERING")
        print("=" * 50)

        from scipy.spatial.distance import pdist

        data = self.embeddings_scaled
        n_samples = len(data)
        if n_samples < 2:
            print("Affinity Propagation could not find suitable clusters")
            return None

        # Condensed pairwise similarities. This avoids materializing an
        # (n, n, d) array, and excludes the all-zero diagonal, whose
        # percentiles would always be 0.
        similarities = -pdist(data, metric='sqeuclidean')
        preference_candidates = sorted(
            {float(p) for p in np.percentile(similarities, (10, 25, 50, 75))}
        )
        damping_candidates = [0.5, 0.7, 0.8, 0.9]

        best_score = -1
        best_params = None
        best_labels = None

        print("Testing different parameter combinations...")
        for preference in preference_candidates:
            for damping in damping_candidates:
                try:
                    affinity_prop = AffinityPropagation(
                        preference=preference,
                        damping=damping,
                        random_state=42,
                        max_iter=200
                    )
                    labels = affinity_prop.fit_predict(data)

                    n_clusters = len(set(labels.tolist()))
                    if 2 <= n_clusters <= n_samples // 3:
                        score = silhouette_score(data, labels)
                        print(f"preference={preference:.2f}, damping={damping:.1f}: {n_clusters} clusters, silhouette={score:.4f}")

                        if score > best_score:
                            best_score = score
                            best_params = (preference, damping)
                            best_labels = labels
                except Exception as e:
                    print(f"preference={preference:.2f}, damping={damping:.1f}: failed ({str(e)[:30]}...)")
                    continue

        if best_labels is None:
            print("Affinity Propagation could not find suitable clusters")
            return None

        print("\nBest Affinity Propagation result:")
        print(f"Parameters: preference={best_params[0]:.2f}, damping={best_params[1]:.1f}")
        print(f"Number of clusters: {len(set(best_labels.tolist()))}")
        print(f"Silhouette score: {best_score:.4f}")
        return best_labels

    def visualize_results(self, results_dict):
        """Plot each method's labels on a 2-D PCA projection.

        Args:
            results_dict: mapping of method name to a label sequence aligned
                with the loaded embeddings. Label ``-1`` is drawn as noise.

        Saves the figure as ``auto_clustering_results.png`` and shows it.
        """
        if not results_dict:
            print("No results to visualize")
            return

        # Reduce dimensions for visualization
        pca = PCA(n_components=2, random_state=42)
        embeddings_2d = pca.fit_transform(self.embeddings_scaled)

        n_methods = len(results_dict)
        # squeeze=False always yields a 2-D axes array, even for one method.
        fig, axes = plt.subplots(1, n_methods, figsize=(5 * n_methods, 4), squeeze=False)

        for ax, (method_name, labels) in zip(axes[0], results_dict.items()):
            labels = np.asarray(labels)  # element-wise comparison needs an array
            unique_labels = sorted(set(labels.tolist()))
            colors = plt.cm.Set1(np.linspace(0, 1, len(unique_labels)))

            for label, color in zip(unique_labels, colors):
                mask = labels == label
                if label == -1:
                    # Noise points in black
                    ax.scatter(embeddings_2d[mask, 0], embeddings_2d[mask, 1],
                               c='black', marker='x', s=20, alpha=0.5, label='Noise')
                else:
                    ax.scatter(embeddings_2d[mask, 0], embeddings_2d[mask, 1],
                               c=[color], s=50, alpha=0.7, label=f'Cluster {label}')

            n_clusters = len(unique_labels) - (1 if -1 in unique_labels else 0)
            ax.set_title(f'{method_name}\n({n_clusters} clusters)')
            ax.set_xlabel('PCA Component 1')
            ax.set_ylabel('PCA Component 2')
            ax.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.savefig('auto_clustering_results.png', dpi=300, bbox_inches='tight')
        plt.show()
        plt.close(fig)  # release the figure once it has been saved and shown

        print("\nVisualization saved as 'auto_clustering_results.png'")

    def save_results(self, results_dict):
        """Save clustering results to one JSON file per method.

        Args:
            results_dict: mapping of method name to a label sequence aligned
                with ``self.file_paths``.

        Each file is named after the lower-cased method name with spaces
        replaced by underscores, plus ``_results.json``. ``is_noise`` is only
        ever true for the ``DBSCAN`` method with label ``-1``.
        """
        for method_name, labels in results_dict.items():
            label_values = [int(label) for label in labels]
            marks_noise = method_name == 'DBSCAN'

            method_results = [
                {
                    "filepath": filepath,
                    "cluster": label,
                    "is_noise": marks_noise and label == -1
                }
                for filepath, label in zip(self.file_paths, label_values)
            ]

            distinct = set(label_values)
            filename = f"{method_name.lower().replace(' ', '_')}_results.json"
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump({
                    "method": method_name,
                    "n_clusters": len(distinct) - (1 if -1 in distinct else 0),
                    "n_samples": len(label_values),
                    "results": method_results
                }, f, indent=4)

            print(f"Results saved to {filename}")

    def run_all_methods(self, methods=("dbscan",)):
        """Run the selected automatic clustering methods and report on them.

        Args:
            methods: iterable of method keys out of ``"dbscan"``,
                ``"meanshift"`` and ``"affinity"``. The default runs DBSCAN
                only; the other two are opt-in.

        Returns:
            Dict mapping display name (``"DBSCAN"``, ``"Mean Shift"``,
            ``"Affinity Propagation"``) to labels, for methods that found
            clusters.

        Raises:
            ValueError: if ``methods`` contains an unknown key.
        """
        runners = {
            "dbscan": ("DBSCAN", self.run_dbscan),
            "meanshift": ("Mean Shift", self.run_mean_shift),
            "affinity": ("Affinity Propagation", self.run_affinity_propagation),
        }
        methods = list(methods)
        unknown = [key for key in methods if key not in runners]
        if unknown:
            raise ValueError(f"Unknown clustering method(s): {unknown}")

        print("\n" + "=" * 70)
        print("AUTOMATIC CLUSTERING ANALYSIS")
        print("=" * 70)
        print(f"Dataset: {len(self.file_paths)} documents")
        print(f"Embedding dimension: {self.embeddings.shape[1]}")

        results = {}
        for key in methods:
            display_name, runner = runners[key]
            labels = runner()
            if labels is not None:
                results[display_name] = labels

        if not results:
            print("\nNo automatic clustering method found suitable clusters.")
            print("This might indicate:")
            print("- Data doesn't have clear cluster structure")
            print("- Embeddings need different preprocessing")
            print("- Different parameter ranges needed")
            return results

        # Summary
        print("\n" + "=" * 70)
        print("SUMMARY OF RESULTS")
        print("=" * 70)

        for method, labels in results.items():
            n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
            if method == "DBSCAN":
                n_noise = list(labels).count(-1)
                print(f"{method}: {n_clusters} clusters, {n_noise} noise points")
            else:
                print(f"{method}: {n_clusters} clusters")

        # Calculate agreement between methods if multiple succeeded
        if len(results) > 1:
            from itertools import combinations
            from sklearn.metrics import adjusted_rand_score
            print("\nMethod Agreement (Adjusted Rand Index):")
            for first, second in combinations(results, 2):
                ari = adjusted_rand_score(results[first], results[second])
                print(f"{first} vs {second}: {ari:.4f}")

        # Visualize and save results
        self.visualize_results(results)
        self.save_results(results)

        return results
|
|
|
|
def main(argv=None):
    """Parse command-line arguments and run the requested clustering method.

    Args:
        argv: optional list of argument strings; ``None`` makes argparse
            read ``sys.argv[1:]``.

    For a single method, results are visualized and saved only when the
    method returns labels. ``--method all`` delegates to
    ``AutoClustering.run_all_methods``.
    """
    parser = argparse.ArgumentParser(description="Run automatic clustering methods on document embeddings")
    # Required: without a path the script would only fail later, inside open().
    parser.add_argument("--embeddings_path", required=True, help="Path to embeddings JSON file")
    parser.add_argument("--method", choices=['dbscan', 'meanshift', 'affinity', 'all'], default='all',
                        help="Which automatic method to run")

    args = parser.parse_args(argv)

    # Initialize clustering
    clustering = AutoClustering(args.embeddings_path)

    # Run selected method(s)
    if args.method == 'all':
        clustering.run_all_methods()
        return

    # CLI choice -> (name used in plots and file names, method to call)
    runners = {
        'dbscan': ("DBSCAN", clustering.run_dbscan),
        'meanshift': ("Mean Shift", clustering.run_mean_shift),
        'affinity': ("Affinity Propagation", clustering.run_affinity_propagation),
    }
    display_name, run_method = runners[args.method]
    labels = run_method()
    if labels is not None:
        results = {display_name: labels}
        clustering.visualize_results(results)
        clustering.save_results(results)


if __name__ == "__main__":
    main()
|