#!/usr/bin/env python3
"""
Simple script to run automatic clustering methods (DBSCAN, Mean Shift, Affinity Propagation)
These methods don't require specifying the number of clusters beforehand.
"""
import json
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN, MeanShift, AffinityPropagation
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA
import argparse
import warnings
warnings.filterwarnings('ignore')
class AutoClustering:
def __init__(self, embeddings_path):
self.embeddings_path = embeddings_path
self.embeddings = None
self.file_paths = None
self.load_embeddings()
def load_embeddings(self):
"""Load embeddings from JSON file"""
print(f"Loading embeddings from {self.embeddings_path}...")
with open(self.embeddings_path, 'r') as f:
data = json.load(f)
self.file_paths = []
embeddings_list = []
for item in data:
self.file_paths.append(item['filepath'])
embeddings_list.append(item['embedding'])
self.embeddings = np.array(embeddings_list, dtype=np.float32)
print(f"Loaded {len(self.file_paths)} samples with embedding dimension {self.embeddings.shape[1]}")
# Standardize embeddings for better clustering
self.scaler = StandardScaler()
self.embeddings_scaled = self.scaler.fit_transform(self.embeddings)
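# StandardScaler gives every embedding dimension zero mean and unit variance, so the
# distance-based methods below are not dominated by a few high-variance dimensions.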
def run_dbscan(self):
"""Run DBSCAN with extensive grid search for parameter estimation"""
print("\n" + "="*50)
print("RUNNING DBSCAN CLUSTERING WITH EXTENSIVE GRID SEARCH")
print("="*50)
# Method 1: K-nearest neighbors approach with multiple k values
eps_candidates = []
# Try different k values for nearest neighbors
k_values = [5, 10, 15, 20, 25, 30]
for k in k_values:
k_actual = min(k, len(self.embeddings_scaled) // 4)
if k_actual < 3:
continue
neighbors = NearestNeighbors(n_neighbors=k_actual)
neighbors_fit = neighbors.fit(self.embeddings_scaled)
distances, indices = neighbors_fit.kneighbors(self.embeddings_scaled)
# Sort distances and use k-th nearest neighbor distance
distances = np.sort(distances, axis=0)
kth_distances = distances[:, k_actual-1]
# Multiple percentile thresholds for each k
percentiles = [60, 65, 70, 75, 80, 85, 90, 95]
for p in percentiles:
eps_candidates.append(np.percentile(kth_distances, p))
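# Rationale: a common DBSCAN heuristic is to sort the k-th nearest-neighbor distances
# (the "k-distance curve") and pick eps near its knee; sampling several percentiles of
# that curve approximates the knee without locating it explicitly.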
# Method 2: Statistical measures
# Mean and std of pairwise distances (sampled for efficiency)
sample_size = min(1000, len(self.embeddings_scaled))
sample_indices = np.random.choice(len(self.embeddings_scaled), sample_size, replace=False)
sample_data = self.embeddings_scaled[sample_indices]
from scipy.spatial.distance import pdist
pairwise_distances = pdist(sample_data)
# Add statistical measures as eps candidates
eps_candidates.extend([
np.mean(pairwise_distances) * 0.3,
np.mean(pairwise_distances) * 0.4,
np.mean(pairwise_distances) * 0.5,
np.mean(pairwise_distances) * 0.6,
np.mean(pairwise_distances) * 0.7,
np.median(pairwise_distances) * 0.3,
np.median(pairwise_distances) * 0.4,
np.median(pairwise_distances) * 0.5,
np.median(pairwise_distances) * 0.6,
np.std(pairwise_distances) * 0.5,
np.std(pairwise_distances) * 0.8,
np.std(pairwise_distances) * 1.0,
np.std(pairwise_distances) * 1.2
])
# Method 3: Manual eps values for different scales
manual_eps = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0,
1.2, 1.5, 1.8, 2.0, 2.5, 3.0, 3.5, 4.0, 5.0]
eps_candidates.extend(manual_eps)
# Remove duplicates and invalid values, then sort
eps_candidates = sorted(list(set([eps for eps in eps_candidates if eps > 0])))
# min_samples candidates, capped at roughly 10% of the dataset size
min_samples_candidates = [3, 4, 5, 6, 7, 8, 9, 10, 12, 15, 18, 20, 25, 30, 35, 40, 50]
max_min_samples = max(3, len(self.embeddings_scaled) // 10)
min_samples_candidates = [ms for ms in min_samples_candidates if ms <= max_min_samples]
best_score = -1
best_params = None
best_labels = None
print(f"Testing {len(eps_candidates)} eps values x {len(min_samples_candidates)} min_samples values")
print(f"Total combinations: {len(eps_candidates) * len(min_samples_candidates)}")
print("This may take a while...\n")
# Track all results for analysis
all_results = []
total_combinations = len(eps_candidates) * len(min_samples_candidates)
current_combination = 0
for eps in eps_candidates:
for min_samples in min_samples_candidates:
current_combination += 1
# Progress indicator
if current_combination % 50 == 0 or current_combination == total_combinations:
progress = (current_combination / total_combinations) * 100
print(f"Progress: {current_combination}/{total_combinations} ({progress:.1f}%)")
try:
dbscan = DBSCAN(eps=eps, min_samples=min_samples)
labels = dbscan.fit_predict(self.embeddings_scaled)
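# DBSCAN assigns the label -1 to noise points, so it is excluded when counting clusters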
# Check if we have meaningful clusters
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
n_noise = list(labels).count(-1)
noise_ratio = n_noise / len(labels)
# Store result for analysis
result_info = {
'eps': eps,
'min_samples': min_samples,
'n_clusters': n_clusters,
'n_noise': n_noise,
'noise_ratio': noise_ratio,
'silhouette_score': None  # filled in below when a score can be computed
}
# Only score combinations that produced at least 2 clusters with less than 90% noise
if n_clusters >= 2 and noise_ratio < 0.9:
# Calculate silhouette score (excluding noise)
mask = labels != -1
if np.sum(mask) > 1:
try:
score = silhouette_score(self.embeddings_scaled[mask], labels[mask])
result_info['silhouette_score'] = score
# Print promising results
if score > 0.1: # Only show decent scores
print(f"eps={eps:.4f}, min_samples={min_samples}: {n_clusters} clusters, {n_noise} noise ({noise_ratio:.1%}), silhouette={score:.4f}")
if score > best_score:
best_score = score
best_params = (eps, min_samples)
best_labels = labels
except Exception:
result_info['silhouette_score'] = None
else:
result_info['silhouette_score'] = None
all_results.append(result_info)
except Exception as e:
# Skip problematic parameter combinations
continue
# Analysis of results
print("\n" + "="*50)
print("DBSCAN GRID SEARCH ANALYSIS")
print("="*50)
if all_results:
# Collect the results in a pandas DataFrame for easier analysis
import pandas as pd
df_results = pd.DataFrame(all_results)
print(f"Total parameter combinations tested: {len(df_results)}")
# Valid results (with clusters)
valid_results = df_results[df_results['n_clusters'] >= 2]
print(f"Combinations that produced clusters: {len(valid_results)}")
if len(valid_results) > 0:
# Best silhouette scores
scored_results = valid_results.dropna(subset=['silhouette_score'])
if len(scored_results) > 0:
print(f"Combinations with valid silhouette scores: {len(scored_results)}")
print(f"Best silhouette score: {scored_results['silhouette_score'].max():.4f}")
print(f"Mean silhouette score: {scored_results['silhouette_score'].mean():.4f}")
# Top 5 results
top_results = scored_results.nlargest(5, 'silhouette_score')
print("\nTop 5 parameter combinations:")
for idx, row in top_results.iterrows():
print(f" eps={row['eps']:.4f}, min_samples={row['min_samples']}: "
f"{row['n_clusters']} clusters, silhouette={row['silhouette_score']:.4f}")
# Cluster count distribution
cluster_counts = valid_results['n_clusters'].value_counts().sort_index()
print(f"\nCluster count distribution:")
for n_clusters, count in cluster_counts.items():
print(f" {n_clusters} clusters: {count} parameter combinations")
print(f"\n📁 SAVING DETAILED RESULTS...")
print("="*30)
# Save detailed grid search results to JSON file
self.save_dbscan_grid_search_results(all_results, best_params, best_score)
if best_labels is not None:
n_clusters = len(set(best_labels)) - (1 if -1 in best_labels else 0)
n_noise = list(best_labels).count(-1)
print(f"\nBest DBSCAN result:")
print(f"Parameters: eps={best_params[0]:.4f}, min_samples={best_params[1]}")
print(f"Number of clusters: {n_clusters}")
print(f"Number of noise points: {n_noise} ({n_noise/len(best_labels)*100:.1f}%)")
print(f"Silhouette score: {best_score:.4f}")
return best_labels
else:
print("DBSCAN could not find suitable clusters with the extensive grid search")
print("Consider:")
print("- Adjusting the embedding space (different model or preprocessing)")
print("- Using different clustering algorithms")
print("- Manual parameter tuning based on domain knowledge")
return None
def save_dbscan_grid_search_results(self, all_results, best_params, best_score):
"""Save detailed DBSCAN grid search results to JSON file"""
import datetime
# Prepare comprehensive results data
grid_search_data = {
"experiment_info": {
"timestamp": datetime.datetime.now().isoformat(),
"dataset_path": self.embeddings_path,
"total_samples": len(self.file_paths),
"embedding_dimension": self.embeddings.shape[1],
"total_combinations_tested": len(all_results)
},
"best_result": {
"eps": best_params[0] if best_params else None,
"min_samples": best_params[1] if best_params else None,
"silhouette_score": best_score if best_score > -1 else None
},
"all_trials": []
}
# Add all trial results
for i, result in enumerate(all_results):
trial_data = {
"trial_id": i + 1,
"parameters": {
"eps": result['eps'],
"min_samples": result['min_samples']
},
"results": {
"n_clusters": result['n_clusters'],
"n_noise": result['n_noise'],
"noise_ratio": result['noise_ratio'],
"silhouette_score": result['silhouette_score']
},
"status": "success" if result['silhouette_score'] is not None else "failed"
}
grid_search_data["all_trials"].append(trial_data)
# Calculate summary statistics
valid_trials = [t for t in grid_search_data["all_trials"] if t["status"] == "success"]
if valid_trials:
silhouette_scores = [t["results"]["silhouette_score"] for t in valid_trials if t["results"]["silhouette_score"] is not None]
if silhouette_scores:
grid_search_data["summary_statistics"] = {
"total_trials": len(all_results),
"successful_trials": len(valid_trials),
"success_rate": len(valid_trials) / len(all_results),
"best_silhouette_score": max(silhouette_scores),
"worst_silhouette_score": min(silhouette_scores),
"mean_silhouette_score": sum(silhouette_scores) / len(silhouette_scores),
"median_silhouette_score": sorted(silhouette_scores)[len(silhouette_scores)//2]
}
# Top 10 results
sorted_valid_trials = sorted(valid_trials,
key=lambda x: x["results"]["silhouette_score"] if x["results"]["silhouette_score"] is not None else -1,
reverse=True)
grid_search_data["top_10_results"] = sorted_valid_trials[:10]
# Parameter analysis
eps_values = [t["parameters"]["eps"] for t in valid_trials]
min_samples_values = [t["parameters"]["min_samples"] for t in valid_trials]
grid_search_data["parameter_analysis"] = {
"eps_range": {
"min": min(eps_values),
"max": max(eps_values),
"mean": sum(eps_values) / len(eps_values)
},
"min_samples_range": {
"min": min(min_samples_values),
"max": max(min_samples_values),
"mean": sum(min_samples_values) / len(min_samples_values)
}
}
# Save to file with timestamp
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"dbscan_grid_search_detailed_{timestamp}.json"
with open(filename, 'w') as f:
json.dump(grid_search_data, f, indent=4, ensure_ascii=False)
print(f"Detailed grid search results saved to: {filename}")
# Also save a CSV summary for easy analysis
csv_filename = f"dbscan_grid_search_summary_{timestamp}.csv"
self.save_grid_search_csv(all_results, csv_filename)
print(f"Grid search summary CSV saved to: {csv_filename}")
def save_grid_search_csv(self, all_results, filename):
"""Save grid search results as CSV for easy analysis"""
import csv
with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
fieldnames = ['trial_id', 'eps', 'min_samples', 'n_clusters', 'n_noise',
'noise_ratio', 'silhouette_score', 'status']
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()
for i, result in enumerate(all_results):
writer.writerow({
'trial_id': i + 1,
'eps': result['eps'],
'min_samples': result['min_samples'],
'n_clusters': result['n_clusters'],
'n_noise': result['n_noise'],
'noise_ratio': result['noise_ratio'],
'silhouette_score': result['silhouette_score'],
'status': 'success' if result['silhouette_score'] is not None else 'failed'
})
def run_mean_shift(self):
"""Run Mean Shift clustering"""
print("\n" + "="*50)
print("RUNNING MEAN SHIFT CLUSTERING")
print("="*50)
# Estimate bandwidth using different percentiles
from sklearn.cluster import estimate_bandwidth
# Try different bandwidth estimation methods
bandwidth_candidates = []
# Method 1: sklearn's estimate_bandwidth
try:
bw_est = estimate_bandwidth(self.embeddings_scaled, quantile=0.3, n_samples=min(500, len(self.embeddings_scaled)))
if bw_est > 0:
bandwidth_candidates.extend([bw_est * 0.5, bw_est, bw_est * 1.5])
except Exception:
pass
# Method 2: nearest neighbor distances
neighbors = NearestNeighbors(n_neighbors=10)
neighbors_fit = neighbors.fit(self.embeddings_scaled)
distances, _ = neighbors_fit.kneighbors(self.embeddings_scaled)
mean_dist = np.mean(distances[:, 1:]) # Exclude self-distance
bandwidth_candidates.extend([mean_dist * 0.5, mean_dist, mean_dist * 1.5])
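# The bandwidth is the radius of the flat kernel Mean Shift uses when computing local
# means; deriving candidates from nearest-neighbor distances keeps them on the same
# scale as the data.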
# Remove duplicates and invalid values
bandwidth_candidates = list(set([bw for bw in bandwidth_candidates if bw > 0]))
if not bandwidth_candidates:
bandwidth_candidates = [0.5, 1.0, 1.5, 2.0]
best_score = -1
best_bandwidth = None
best_labels = None
print("Testing different bandwidth values...")
for bandwidth in bandwidth_candidates:
try:
mean_shift = MeanShift(bandwidth=bandwidth)
labels = mean_shift.fit_predict(self.embeddings_scaled)
n_clusters = len(set(labels))
if 2 <= n_clusters <= len(self.embeddings_scaled) // 3:
score = silhouette_score(self.embeddings_scaled, labels)
print(f"bandwidth={bandwidth:.4f}: {n_clusters} clusters, silhouette={score:.4f}")
if score > best_score:
best_score = score
best_bandwidth = bandwidth
best_labels = labels
except Exception as e:
print(f"bandwidth={bandwidth:.4f}: failed ({str(e)[:50]}...)")
continue
if best_labels is not None:
n_clusters = len(set(best_labels))
print(f"\nBest Mean Shift result:")
print(f"Bandwidth: {best_bandwidth:.4f}")
print(f"Number of clusters: {n_clusters}")
print(f"Silhouette score: {best_score:.4f}")
return best_labels
else:
print("Mean Shift could not find suitable clusters")
return None
def run_affinity_propagation(self):
"""Run Affinity Propagation clustering"""
print("\n" + "="*50)
print("RUNNING AFFINITY PROPAGATION CLUSTERING")
print("="*50)
# Calculate similarity matrix
similarities = -np.sum((self.embeddings_scaled[:, np.newaxis] - self.embeddings_scaled)**2, axis=2)
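# Note: this materializes a full n x n matrix of negative squared Euclidean distances
# (the same similarity AffinityPropagation computes internally with its default
# 'euclidean' affinity). It is only used here to pick preference candidates, and its
# memory cost grows as O(n^2).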
# Try different preference values (percentiles of the similarity matrix; the diagonal
# of this matrix is all zeros, so it is not informative for choosing preferences)
preference_candidates = [
np.percentile(similarities, 5),
np.percentile(similarities, 10),
np.percentile(similarities, 25),
np.median(similarities),
np.percentile(similarities, 75)
]
damping_candidates = [0.5, 0.7, 0.8, 0.9]
best_score = -1
best_params = None
best_labels = None
print("Testing different parameter combinations...")
for preference in preference_candidates:
for damping in damping_candidates:
try:
affinity_prop = AffinityPropagation(
preference=preference,
damping=damping,
random_state=42,
max_iter=200
)
labels = affinity_prop.fit_predict(self.embeddings_scaled)
n_clusters = len(set(labels))
if 2 <= n_clusters <= len(self.embeddings_scaled) // 3:
score = silhouette_score(self.embeddings_scaled, labels)
print(f"preference={preference:.2f}, damping={damping:.1f}: {n_clusters} clusters, silhouette={score:.4f}")
if score > best_score:
best_score = score
best_params = (preference, damping)
best_labels = labels
except Exception as e:
print(f"preference={preference:.2f}, damping={damping:.1f}: failed ({str(e)[:30]}...)")
continue
if best_labels is not None:
n_clusters = len(set(best_labels))
print(f"\nBest Affinity Propagation result:")
print(f"Parameters: preference={best_params[0]:.2f}, damping={best_params[1]:.1f}")
print(f"Number of clusters: {n_clusters}")
print(f"Silhouette score: {best_score:.4f}")
return best_labels
else:
print("Affinity Propagation could not find suitable clusters")
return None
def visualize_results(self, results_dict):
"""Visualize clustering results using PCA"""
if not results_dict:
print("No results to visualize")
return
# Reduce dimensions for visualization
pca = PCA(n_components=2, random_state=42)
embeddings_2d = pca.fit_transform(self.embeddings_scaled)
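# The 2-D PCA projection is for visualization only; clusters separated in the full
# embedding space can overlap here if the first two components explain little variance.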
n_methods = len(results_dict)
fig, axes = plt.subplots(1, n_methods, figsize=(5*n_methods, 4))
if n_methods == 1:
axes = [axes]
for idx, (method_name, labels) in enumerate(results_dict.items()):
# Handle noise points in DBSCAN (label -1)
unique_labels = set(labels)
colors = plt.cm.Set1(np.linspace(0, 1, len(unique_labels)))
for label, color in zip(unique_labels, colors):
if label == -1:
# Noise points in black
mask = labels == label
axes[idx].scatter(embeddings_2d[mask, 0], embeddings_2d[mask, 1],
c='black', marker='x', s=20, alpha=0.5, label='Noise')
else:
mask = labels == label
axes[idx].scatter(embeddings_2d[mask, 0], embeddings_2d[mask, 1],
c=[color], s=50, alpha=0.7, label=f'Cluster {label}')
axes[idx].set_title(f'{method_name}\n({len(set(labels)) - (1 if -1 in labels else 0)} clusters)')
axes[idx].set_xlabel('PCA Component 1')
axes[idx].set_ylabel('PCA Component 2')
axes[idx].grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('auto_clustering_results.png', dpi=300, bbox_inches='tight')
plt.show()
print(f"\nVisualization saved as 'auto_clustering_results.png'")
def save_results(self, results_dict):
"""Save clustering results to JSON files (one file per method)"""
for method_name, labels in results_dict.items():
# Create per-file records for this method; only DBSCAN produces noise points (label -1)
method_results = []
for filepath, label in zip(self.file_paths, labels):
is_noise = (method_name == 'DBSCAN' and label == -1)
method_results.append({
"filepath": filepath,
"cluster": int(label),
"is_noise": is_noise
})
# Save to file
filename = f"{method_name.lower().replace(' ', '_')}_results.json"
with open(filename, 'w') as f:
json.dump({
"method": method_name,
"n_clusters": len(set(labels)) - (1 if -1 in labels else 0),
"n_samples": len(labels),
"results": method_results
}, f, indent=4)
print(f"Results saved to {filename}")
def run_all_methods(self):
"""Run all automatic clustering methods"""
print("\n" + "="*70)
print("AUTOMATIC CLUSTERING ANALYSIS")
print("="*70)
print(f"Dataset: {len(self.file_paths)} documents")
print(f"Embedding dimension: {self.embeddings.shape[1]}")
results = {}
# Run DBSCAN
dbscan_labels = self.run_dbscan()
if dbscan_labels is not None:
results["DBSCAN"] = dbscan_labels
# Run Mean Shift
# meanshift_labels = self.run_mean_shift()
# if meanshift_labels is not None:
# results["Mean Shift"] = meanshift_labels
# Run Affinity Propagation
# affinity_labels = self.run_affinity_propagation()
# if affinity_labels is not None:
# results["Affinity Propagation"] = affinity_labels
# Summary
if results:
print("\n" + "="*70)
print("SUMMARY OF RESULTS")
print("="*70)
for method, labels in results.items():
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
if method == "DBSCAN":
n_noise = list(labels).count(-1)
print(f"{method}: {n_clusters} clusters, {n_noise} noise points")
else:
print(f"{method}: {n_clusters} clusters")
# Calculate agreement between methods if multiple succeeded
if len(results) > 1:
from sklearn.metrics import adjusted_rand_score
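# The Adjusted Rand Index is 1.0 for identical partitions and close to 0 for random
# agreement; it is invariant to how cluster labels are numbered.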
print("\nMethod Agreement (Adjusted Rand Index):")
method_names = list(results.keys())
for i in range(len(method_names)):
for j in range(i+1, len(method_names)):
ari = adjusted_rand_score(results[method_names[i]], results[method_names[j]])
print(f"{method_names[i]} vs {method_names[j]}: {ari:.4f}")
# Visualize and save results
self.visualize_results(results)
self.save_results(results)
else:
print("\nNo automatic clustering method found suitable clusters.")
print("This might indicate:")
print("- Data doesn't have clear cluster structure")
print("- Embeddings need different preprocessing")
print("- Different parameter ranges needed")
return results
def main():
parser = argparse.ArgumentParser(description="Run automatic clustering methods on document embeddings")
parser.add_argument("--embeddings_path", help="Path to embeddings JSON file")
parser.add_argument("--method", choices=['dbscan', 'meanshift', 'affinity', 'all'], default='all',
help="Which automatic method to run")
args = parser.parse_args()
# Initialize clustering
clustering = AutoClustering(args.embeddings_path)
# Run selected method(s)
if args.method == 'all':
clustering.run_all_methods()
elif args.method == 'dbscan':
labels = clustering.run_dbscan()
if labels is not None:
clustering.visualize_results({"DBSCAN": labels})
clustering.save_results({"DBSCAN": labels})
elif args.method == 'meanshift':
labels = clustering.run_mean_shift()
if labels is not None:
clustering.visualize_results({"Mean Shift": labels})
clustering.save_results({"Mean Shift": labels})
elif args.method == 'affinity':
labels = clustering.run_affinity_propagation()
if labels is not None:
clustering.visualize_results({"Affinity Propagation": labels})
clustering.save_results({"Affinity Propagation": labels})
if __name__ == "__main__":
main()