Files
embedding-clustering/filter/run_filter.sh

139 lines
6.0 KiB
Bash

#!/bin/bash
# Example script to run DBSCAN filtering
# Make sure to update the paths according to your data
# EMBEDDINGS_PATH="/home/nguyendc/sonnh/embedding-clustering/extract/embeddings_factures_osteopathie_1k_qwen.json"
# CLUSTERING_RESULTS_PATH="/home/nguyendc/sonnh/embedding-clustering/cluster/dbscan_results.json"
# OUTPUT_PATH="dbscan_filtered_results.json" #0.5 of data, center 0.5, 0.5 border
# OUTPUT_PATH="dbscan_filtered_results_v2.json" #0.5 of data, 0.25 center, 0.75 border
# OUTPUT_PATH="dbscan_filtered_results_v3.json" #0.75 of data, center 0.25 border 0.75
# echo "Running DBSCAN filtering..."
# echo "Embeddings: $EMBEDDINGS_PATH"
# echo "Clustering results: $CLUSTERING_RESULTS_PATH"
# echo "Output: $OUTPUT_PATH"
# python dbscan.py \
# --embeddings_path "$EMBEDDINGS_PATH" \
# --clustering_results_path "$CLUSTERING_RESULTS_PATH" \
# --output_path "$OUTPUT_PATH" \
# --create_filepath_list
# echo "Filtering completed!"
# EMBEDDINGS_PATH="/home/nguyendc/sonnh/embedding-clustering/extract/embeddings_factures_osteopathie_1k_qwen.json"
# CLUSTERING_RESULTS_PATH="/home/nguyendc/sonnh/embedding-clustering/cluster/gmm_final_results_bic_20250805_150636.json"
# OUTPUT_PATH="gmm_best_by_BIC_filtered_results.json" #0.75 of data, center 0.25 border 0.75
# echo "Running DBSCAN filtering..."
# echo "Embeddings: $EMBEDDINGS_PATH"
# echo "Clustering results: $CLUSTERING_RESULTS_PATH"
# echo "Output: $OUTPUT_PATH"
# python dbscan.py \
# --embeddings_path "$EMBEDDINGS_PATH" \
# --clustering_results_path "$CLUSTERING_RESULTS_PATH" \
# --output_path "$OUTPUT_PATH" \
# --create_filepath_list
# echo "Filtering completed!"
# EMBEDDINGS_PATH="/home/nguyendc/sonnh/embedding-clustering/extract/embeddings_factures_osteopathie_1k_qwen.json"
# CLUSTERING_RESULTS_PATH="/home/nguyendc/sonnh/embedding-clustering/cluster/dbscan_results.json"
# OUTPUT_PATH="014_re_run_dbscan_filtered_results_temp.json" #0.75 of data, center 0.25 border 0.75
# python dbscan_v2.py \
# --embeddings_path "$EMBEDDINGS_PATH" \
# --clustering_results_path "$CLUSTERING_RESULTS_PATH" \
# --output_path "$OUTPUT_PATH" \
# --create_filepath_list
# EMBEDDINGS_PATH="/home/nguyendc/sonnh/embedding-clustering/extract/embeddings_factures_osteopathie_1k_qwen.json"
# CLUSTERING_RESULTS_PATH="/home/nguyendc/sonnh/embedding-clustering/cluster/gmm_final_results_bic_20250805_150636.json"
# OUTPUT_PATH="015_gmm_best_by_BIC_filtered_results_temp.json" #0.75 of data, center 0.25 border 0.75
# python dbscan_v2.py \
# --embeddings_path "$EMBEDDINGS_PATH" \
# --clustering_results_path "$CLUSTERING_RESULTS_PATH" \
# --output_path "$OUTPUT_PATH" \
# --create_filepath_list
# EMBEDDINGS_PATH="/home/nguyendc/sonnh/embedding-clustering/extract/embeddings_factures_osteopathie_1k_qwen.json"
# CLUSTERING_RESULTS_PATH="/home/nguyendc/sonnh/embedding-clustering/cluster/dbscan_results.json"
# OUTPUT_DIR="dbscan_v3_out_clusters"
# # python dbscan_v3_only_one_cluster.py \
# # --embeddings_path "$EMBEDDINGS_PATH" \
# # --clustering_results_path "$CLUSTERING_RESULTS_PATH" \
# # --output_dir "$OUTPUT_DIR" \
# python dbscan_only_one_cluster.py \
# --embeddings_path "$EMBEDDINGS_PATH" \
# --clustering_results_path "$CLUSTERING_RESULTS_PATH" \
# --save_full_clusters \
# --clusters_output_dir per_clusters
# EMBEDDINGS_PATH="/home/nguyendc/sonnh/embedding-clustering/extract/embeddings_factures_osteopathie_1k_qwen.json"
# CLUSTERING_RESULTS_PATH="/home/nguyendc/sonnh/embedding-clustering/cluster/dbscan_results.json"
# OUTPUT_PATH="026_dbscan_v3_filtered_results_temp.json"
# python dbscan_v3.py \
# --embeddings_path "$EMBEDDINGS_PATH" \
# --clustering_results_path "$CLUSTERING_RESULTS_PATH" \
# --output_path "$OUTPUT_PATH" \
# --create_filepath_list \
# --selection_ratio 1.0 --center_ratio 0.5 --border_ratio 0.5
# EMBEDDINGS_PATH="/home/nguyendc/sonnh/embedding-clustering/extract/embeddings_factures_osteopathie_1k_qwen.json"
# CLUSTERING_RESULTS_PATH="/home/nguyendc/sonnh/embedding-clustering/cluster/dbscan_results.json"
# OUTPUT_PATH="027_dbscan_v3_filtered_results_temp.json"
# python dbscan_v3.py \
# --embeddings_path "$EMBEDDINGS_PATH" \
# --clustering_results_path "$CLUSTERING_RESULTS_PATH" \
# --output_path "$OUTPUT_PATH" \
# --create_filepath_list \
# --selection_ratio 1.0 --center_ratio 0.25 --border_ratio 0.75
# EMBEDDINGS_PATH="/home/nguyendc/sonnh/embedding-clustering/extract/embeddings_factures_osteopathie_1k_qwen.json"
# CLUSTERING_RESULTS_PATH="/home/nguyendc/sonnh/embedding-clustering/cluster/dbscan_results.json"
# OUTPUT_PATH="019_dbscan_v3_filtered_results_temp.json"
# python dbscan_v3.py \
# --embeddings_path "$EMBEDDINGS_PATH" \
# --clustering_results_path "$CLUSTERING_RESULTS_PATH" \
# --output_path "$OUTPUT_PATH" \
# --create_filepath_list \
# --selection_ratio 0.12 --center_ratio 0.25 --border_ratio 0.75
# EMBEDDINGS_PATH="/home/nguyendc/sonnh/embedding-clustering/extract/embeddings_factures_osteopathie_1k_qwen.json"
# CLUSTERING_RESULTS_PATH="/home/nguyendc/sonnh/embedding-clustering/cluster/gmm_final_results_bic_20250805_150636.json"
# OUTPUT_PATH="028_gmm_best_by_BIC_filtered_results.json" #0.75 of data, center 0.25 border 0.75
# python dbscan_v3.py \
# --embeddings_path "$EMBEDDINGS_PATH" \
# --clustering_results_path "$CLUSTERING_RESULTS_PATH" \
# --output_path "$OUTPUT_PATH" \
# --create_filepath_list \
# --selection_ratio 1.0 --center_ratio 0.25 --border_ratio 0.75
# Active run (experiment 029): filter the DBSCAN clustering results with
# dbscan_v3.py. The embeddings JSON and the clustering-results JSON are the
# inputs; OUTPUT_PATH (a relative path) is handed to the script as
# --output_path.
EMBEDDINGS_PATH="/home/nguyendc/sonnh/embedding-clustering/extract/embeddings_factures_osteopathie_1k_qwen.json"
CLUSTERING_RESULTS_PATH="/home/nguyendc/sonnh/embedding-clustering/cluster/dbscan_results.json"
OUTPUT_PATH="029_dbscan_v3_filtered_results_temp_30.json"

#######################################
# Invoke dbscan_v3.py once with this experiment's ratios:
#   --selection_ratio 0.6 --center_ratio 0.5 --border_ratio 0.5
# (presumably: keep 60% of the data, split evenly between cluster-center and
# border samples -- confirm against dbscan_v3.py).
# Both input files are checked up front so that a wrong path produces a clear
# message instead of a Python traceback.
# Arguments:
#   $1 - path to the embeddings JSON file
#   $2 - path to the clustering results JSON file
#   $3 - path passed to dbscan_v3.py as --output_path
# Outputs:
#   Writes an error message to stderr when an input file is missing.
# Returns:
#   1 if an input file is missing; otherwise the exit status of python.
#######################################
run_dbscan_v3_filter() {
  local embeddings_path=$1
  local clustering_results_path=$2
  local output_path=$3
  local input_file

  for input_file in "$embeddings_path" "$clustering_results_path"; do
    if [[ ! -f "$input_file" ]]; then
      printf 'run_filter.sh: input file not found: %s\n' "$input_file" >&2
      return 1
    fi
  done

  # dbscan_v3.py is referenced by a relative path, so this must be launched
  # from the directory that contains it.
  python dbscan_v3.py \
    --embeddings_path "$embeddings_path" \
    --clustering_results_path "$clustering_results_path" \
    --output_path "$output_path" \
    --create_filepath_list \
    --selection_ratio 0.6 --center_ratio 0.5 --border_ratio 0.5
}

run_dbscan_v3_filter "$EMBEDDINGS_PATH" "$CLUSTERING_RESULTS_PATH" "$OUTPUT_PATH"