# embedding-clustering/filter/dbscan_v3.py

#!/usr/bin/env python3
"""
DBSCAN Clustering Filter

Filters clustering results based on specific criteria, parameterized via CLI:
- For each cluster: select a configurable ratio of points (selection_ratio)
    - A configurable portion from center region (center_ratio)
    - A configurable portion from border region (border_ratio)
- Noise points are randomly sampled at the same selection_ratio
- Uses cosine distance metric

CLI parameters added:
    --selection_ratio  (float, default 0.5)
    --center_ratio     (float, default 0.5)
    --border_ratio     (float, default 0.5)

Example:
    python dbscan_v3.py \
            --embeddings_path embeddings.json \
            --clustering_results_path dbscan_results.json \
            --selection_ratio 0.4 --center_ratio 0.6 --border_ratio 0.4
"""

import json
import numpy as np
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_distances
import argparse
import os
from pathlib import Path
import random


class DBSCANFilter:
    """Down-sample DBSCAN clustering results using cosine distance to centroids.

    Typical usage is ``load_data()`` -> ``filter_all_clusters()`` ->
    ``save_filtered_results()`` (and optionally ``create_filepath_list()``).

    Sampling is deterministic: every sampling step draws from a private
    ``random.Random`` seeded with ``RANDOM_SEED``, so the process-global
    ``random`` state is never modified.
    """

    # Cosine distance to the centroid below which a point counts as "center";
    # points at or above it count as "border".
    CENTER_DISTANCE_THRESHOLD = 0.1
    # Lower bound used when deciding whether a cluster is small enough to be
    # kept whole (see filter_cluster).
    MIN_POINTS_PER_CLUSTER = 15
    # Border regions with fewer points than this are kept entirely.
    SMALL_BORDER_SIZE = 70
    # Seed for every sampling step, so repeated runs select the same points.
    RANDOM_SEED = 42

    def __init__(self, embeddings_path, clustering_results_path,
                 selection_ratio=0.5, center_ratio=0.5, border_ratio=0.5):
        """Initialize DBSCAN filter

        Args:
            embeddings_path: Path to embeddings JSON file
            clustering_results_path: Path to DBSCAN clustering results JSON
            selection_ratio: Overall sampling ratio; scales both the center and
                border sample sizes and is also the noise sampling ratio
            center_ratio: Ratio applied within center region (relative scaling)
            border_ratio: Ratio applied within border region (relative scaling)
        """
        self.embeddings_path = embeddings_path
        self.clustering_results_path = clustering_results_path
        self.embeddings = None
        self.embeddings_normalized = None
        self.clustering_results = None
        self.filepath_to_embedding = {}
        self.selection_ratio = selection_ratio
        self.center_ratio = center_ratio
        self.border_ratio = border_ratio

    def load_data(self):
        """Load embeddings and clustering results from their JSON files.

        Populates ``filepath_to_embedding``, ``embeddings``,
        ``embeddings_normalized`` and ``clustering_results``.

        Raises:
            ValueError: If the embeddings file contains no items.
        """
        print("Loading embeddings...")
        # Read explicitly as UTF-8: the outputs of this class are written as
        # UTF-8 with ensure_ascii=False, so inputs are read the same way
        # instead of relying on the platform default encoding.
        with open(self.embeddings_path, 'r', encoding='utf-8') as f:
            embeddings_data = json.load(f)

        # Create mapping from filepath to embedding
        embeddings_list = []
        for item in embeddings_data:
            self.filepath_to_embedding[item['filepath']] = item['embedding']
            embeddings_list.append(item['embedding'])

        if not embeddings_list:
            raise ValueError(f"No embeddings found in: {self.embeddings_path}")

        # The full matrices are stored on the instance; the other methods of
        # this class work from filepath_to_embedding instead.
        self.embeddings = np.array(embeddings_list, dtype=np.float32)
        self.embeddings_normalized = normalize(self.embeddings, norm='l2')
        print(f"Loaded {len(embeddings_list)} embeddings")

        print("Loading clustering results...")
        with open(self.clustering_results_path, 'r', encoding='utf-8') as f:
            self.clustering_results = json.load(f)

        print(f"Loaded clustering results: {self.clustering_results['n_clusters']} clusters, "
              f"{self.clustering_results['n_samples']} samples")

    def group_by_clusters(self):
        """Group data points by cluster labels.

        A result is treated as noise when its ``is_noise`` flag is truthy or
        its ``cluster`` label is ``-1``.

        Returns:
            Tuple ``(clusters, noise_points)``: ``clusters`` maps each cluster
            id to a list of ``{'filepath', 'embedding'}`` dicts and
            ``noise_points`` is a list of the same kind of dict.

        Raises:
            KeyError: If a clustered filepath has no loaded embedding.
        """
        clusters = {}
        noise_points = []

        for result in self.clustering_results['results']:
            cluster_id = result['cluster']
            filepath = result['filepath']

            try:
                embedding = self.filepath_to_embedding[filepath]
            except KeyError:
                # Same exception type as a plain lookup, but saying which
                # input is inconsistent.
                raise KeyError(
                    f"No embedding found for clustered file: {filepath}"
                ) from None

            point = {'filepath': filepath, 'embedding': embedding}
            if result.get('is_noise', False) or cluster_id == -1:
                noise_points.append(point)
            else:
                clusters.setdefault(cluster_id, []).append(point)

        return clusters, noise_points

    def calculate_cluster_centroid(self, cluster_points):
        """Calculate centroid of a cluster using normalized embeddings.

        Args:
            cluster_points: List of dicts, each with an ``'embedding'`` entry.

        Returns:
            The L2-normalized mean of the L2-normalized embeddings.
        """
        embeddings = np.array([point['embedding'] for point in cluster_points])
        embeddings_normalized = normalize(embeddings, norm='l2')

        # For cosine distance, centroid is the normalized mean
        centroid = np.mean(embeddings_normalized, axis=0)
        centroid_normalized = normalize(centroid.reshape(1, -1), norm='l2')[0]

        return centroid_normalized

    def calculate_cosine_distances_to_centroid(self, cluster_points, centroid):
        """Calculate cosine distances from each point to cluster centroid.

        Args:
            cluster_points: List of dicts, each with an ``'embedding'`` entry.
            centroid: Centroid vector to measure against.

        Returns:
            Flat array with one distance per point, in ``cluster_points`` order.
        """
        embeddings = np.array([point['embedding'] for point in cluster_points])
        embeddings_normalized = normalize(embeddings, norm='l2')

        # Calculate cosine distances to centroid
        distances = cosine_distances(embeddings_normalized, centroid.reshape(1, -1)).flatten()

        return distances

    def filter_cluster(self, cluster_points):
        """Filter the points of one cluster using the configured ratios.

        Points are split into a center region (cosine distance to the centroid
        below ``CENTER_DISTANCE_THRESHOLD``) and a border region (the rest),
        and each region is sampled separately.

        Args:
            cluster_points: List of ``{'filepath', 'embedding'}`` dicts.

        Returns:
            The selected points. The input list itself is returned unchanged
            when the cluster is small enough to be kept whole.
        """
        if not cluster_points:
            return []

        selection_ratio = self.selection_ratio
        center_ratio = self.center_ratio
        border_ratio = self.border_ratio

        total_points = len(cluster_points)
        num_to_select = max(self.MIN_POINTS_PER_CLUSTER,
                            int(total_points * selection_ratio))
        # Keep small clusters whole. With selection_ratio == 1 this shortcut is
        # skipped on purpose so center_ratio / border_ratio still apply.
        if num_to_select >= total_points and selection_ratio != 1:
            return cluster_points

        centroid = self.calculate_cluster_centroid(cluster_points)
        distances = self.calculate_cosine_distances_to_centroid(cluster_points, centroid)
        # Sorting by distance fixes the order of the two region lists, which in
        # turn determines which points the seeded sampler picks.
        point_distance_pairs = list(zip(cluster_points, distances))
        point_distance_pairs.sort(key=lambda x: x[1])

        dis = self.CENTER_DISTANCE_THRESHOLD  # distance threshold for classifying center / border
        all_center_points = [p for p, d in point_distance_pairs if d < dis]
        all_border_points = [p for p, d in point_distance_pairs if d >= dis]

        print(f"Number of center points (distance < {dis}): {len(all_center_points)}")
        print(f"Number of border points (distance >= {dis}): {len(all_border_points)}")

        n_center = len(all_center_points)
        n_border = len(all_border_points)

        if n_center > 0:
            # At least one center point is kept whenever the region is non-empty.
            center_count = max(1, int(n_center * center_ratio * selection_ratio))
            center_count = min(center_count, n_center)
        else:
            center_count = 0

        if n_border < self.SMALL_BORDER_SIZE:
            # Small border regions are kept entirely.
            border_count = n_border
        else:
            border_count = max(0, int(n_border * border_ratio * selection_ratio))
            border_count = min(border_count, n_border)

        # A private generator draws the same samples as seeding the global
        # `random` module with the same seed, without clobbering global state.
        rng = random.Random(self.RANDOM_SEED)
        selected_points = []
        if center_count > 0:
            selected_points.extend(rng.sample(all_center_points, center_count))
        if border_count > 0:
            selected_points.extend(rng.sample(all_border_points, border_count))

        print(
            f"Cluster with {total_points} points -> selected {len(selected_points)} points "
            f"({center_count} center + {border_count} border)"
        )
        return selected_points

    def filter_all_clusters(self):
        """Filter every cluster and sample the noise points.

        Each cluster goes through ``filter_cluster``. Noise points are sampled
        at ``selection_ratio``, with the sample size clamped to the number of
        noise points available.

        Returns:
            List of dicts with ``filepath``, ``cluster``, ``is_noise`` and
            ``selection_type`` entries; cluster points first, then noise.
        """
        print("\n" + "="*60)
        print("FILTERING DBSCAN CLUSTERING RESULTS")
        print("="*60)

        clusters, noise_points = self.group_by_clusters()

        print(f"Found {len(clusters)} clusters and {len(noise_points)} noise points")

        filtered_results = []

        # Process each cluster
        for cluster_id, cluster_points in clusters.items():
            print(f"\nProcessing Cluster {cluster_id}:")
            filtered_points = self.filter_cluster(cluster_points)

            # Add cluster information
            for point in filtered_points:
                filtered_results.append({
                    'filepath': point['filepath'],
                    'cluster': cluster_id,
                    'is_noise': False,
                    'selection_type': 'cluster_filtered'
                })

        # Sample noise points. Clamp to the population size: a ratio above 1
        # would otherwise ask random.sample for more items than exist.
        n_noise = len(noise_points)
        noise_count = min(n_noise, max(0, int(n_noise * self.selection_ratio)))
        print(f"\nSelecting {noise_count} of {n_noise} noise points...")
        rng = random.Random(self.RANDOM_SEED)
        selected_noise_points = rng.sample(noise_points, noise_count)

        for point in selected_noise_points:
            filtered_results.append({
                'filepath': point['filepath'],
                'cluster': -1,
                'is_noise': True,
                'selection_type': 'noise'
            })

        return filtered_results

    def save_filtered_results(self, filtered_results, output_path=None):
        """Save filtered results to a JSON file and print a summary.

        Args:
            filtered_results: Output of ``filter_all_clusters``.
            output_path: Destination file. When ``None``, a name derived from
                the clustering results filename is used (relative path).

        Returns:
            The path the results were written to.
        """
        if output_path is None:
            # Generate output filename based on input
            base_name = Path(self.clustering_results_path).stem
            output_path = f"{base_name}_filtered.json"

        # Create summary statistics
        cluster_stats = {}
        noise_count = 0

        for result in filtered_results:
            if result['is_noise']:
                noise_count += 1
            else:
                cluster_id = result['cluster']
                cluster_stats[cluster_id] = cluster_stats.get(cluster_id, 0) + 1

        original_n_samples = self.clustering_results['n_samples']

        # Prepare output data
        output_data = {
            "method": "DBSCAN_FILTERED",
            "original_n_clusters": self.clustering_results['n_clusters'],
            "original_n_samples": original_n_samples,
            "filtered_n_samples": len(filtered_results),
            "filtering_criteria": {
                "cluster_selection_ratio": self.selection_ratio,
                "center_points_ratio": self.center_ratio,
                "border_points_ratio": self.border_ratio,
                # Noise is sampled at selection_ratio, so it is only "all
                # selected" when that ratio reaches 1.
                "noise_points": (
                    "all_selected" if self.selection_ratio >= 1
                    else "sampled_by_selection_ratio"
                )
            },
            "cluster_statistics": cluster_stats,
            "noise_points": noise_count,
            "results": filtered_results
        }

        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(output_data, f, indent=4, ensure_ascii=False)

        print("\n" + "="*60)
        print("FILTERING SUMMARY")
        print("="*60)
        print(f"Original samples: {original_n_samples}")
        print(f"Filtered samples: {len(filtered_results)}")
        # Guard the division so an empty clustering result does not crash
        # after the file has already been written.
        if original_n_samples:
            print(f"Reduction ratio: {len(filtered_results)/original_n_samples:.2%}")
        else:
            print("Reduction ratio: n/a (no original samples)")
        print("\nCluster breakdown:")
        for cluster_id, count in sorted(cluster_stats.items()):
            print(f"  Cluster {cluster_id}: {count} points")
        print(f"  Noise points: {noise_count} points")
        print(f"\nFiltered results saved to: {output_path}")

        return output_path

    def create_filepath_list(self, filtered_results, output_txt_path=None):
        """Create a simple text file with one filtered filepath per line.

        Args:
            filtered_results: Output of ``filter_all_clusters``.
            output_txt_path: Destination file. When ``None``, a name derived
                from the clustering results filename is used (relative path).

        Returns:
            The path the list was written to.
        """
        if output_txt_path is None:
            base_name = Path(self.clustering_results_path).stem
            output_txt_path = f"{base_name}_filtered_filepaths.txt"

        with open(output_txt_path, 'w', encoding='utf-8') as f:
            f.writelines(f"{result['filepath']}\n" for result in filtered_results)

        print(f"Filepath list saved to: {output_txt_path}")
        return output_txt_path


def main():
    """Parse CLI arguments and run the DBSCAN filtering pipeline.

    Loads the data, filters all clusters, saves the filtered results and,
    when ``--create_filepath_list`` is given, also writes the filepath list.

    A missing input file is reported through ``parser.error``, which prints
    the message to stderr and exits with a non-zero status so that calling
    scripts can detect the failure.
    """
    parser = argparse.ArgumentParser(description="Filter DBSCAN clustering results")
    parser.add_argument("--embeddings_path", required=True,
                        help="Path to embeddings JSON file")
    parser.add_argument("--clustering_results_path", required=True,
                        help="Path to DBSCAN clustering results JSON file")
    parser.add_argument("--output_path",
                        help="Output path for filtered results (optional)")
    parser.add_argument("--create_filepath_list", action="store_true",
                        help="Also create a simple text file with filtered filepaths")
    parser.add_argument("--selection_ratio", type=float, default=0.5,
                        help="Overall ratio of points to sample per cluster (default: 0.5). Minimum 15 points enforced.")
    parser.add_argument("--center_ratio", type=float, default=0.5,
                        help="Relative ratio applied to center region when sampling (default: 0.5)")
    parser.add_argument("--border_ratio", type=float, default=0.5,
                        help="Relative ratio applied to border region when sampling (default: 0.5)")

    args = parser.parse_args()

    # Validate input files exist; parser.error() does not return.
    if not os.path.exists(args.embeddings_path):
        parser.error(f"Embeddings file not found: {args.embeddings_path}")

    if not os.path.exists(args.clustering_results_path):
        parser.error(f"Clustering results file not found: {args.clustering_results_path}")

    # Initialize filter with user-provided ratios
    filter_obj = DBSCANFilter(
        args.embeddings_path,
        args.clustering_results_path,
        selection_ratio=args.selection_ratio,
        center_ratio=args.center_ratio,
        border_ratio=args.border_ratio
    )

    # Load data
    filter_obj.load_data()

    # Filter clusters
    filtered_results = filter_obj.filter_all_clusters()

    # Save results
    filter_obj.save_filtered_results(filtered_results, args.output_path)

    # Create filepath list if requested
    if args.create_filepath_list:
        filter_obj.create_filepath_list(filtered_results)

    print("\nFiltering completed successfully!")

# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
update source code and pipeline 2025-09-04 14:39:02 +00:00			`#!/usr/bin/env python3`
			`"""`
			`DBSCAN Clustering Filter`

			`Filters clustering results based on specific criteria, parameterized via CLI:`
			`- For each cluster: select a configurable ratio of points (selection_ratio)`
			`- A configurable portion from center region (center_ratio)`
			`- A configurable portion from border region (border_ratio)`
			`- All noise points are selected`
			`- Uses cosine distance metric`

			`CLI parameters added:`
			`--selection_ratio (float, default 0.5)`
			`--center_ratio (float, default 0.5)`
			`--border_ratio (float, default 0.5)`

			`Example:`
			`python dbscan_v3.py \`
			`--embeddings_path embeddings.json \`
			`--clustering_results_path dbscan_results.json \`
			`--selection_ratio 0.4 --center_ratio 0.6 --border_ratio 0.4`
			`"""`

			`import json`
			`import numpy as np`
			`from sklearn.preprocessing import normalize`
			`from sklearn.metrics.pairwise import cosine_distances`
			`import argparse`
			`import os`
			`from pathlib import Path`
			`import random`


			`class DBSCANFilter:`
			`def __init__(self, embeddings_path, clustering_results_path,`
			`selection_ratio=0.5, center_ratio=0.5, border_ratio=0.5):`
			`"""Initialize DBSCAN filter`

			`Args:`
			`embeddings_path: Path to embeddings JSON file`
			`clustering_results_path: Path to DBSCAN clustering results JSON`
			`selection_ratio: Ratio of total cluster points to consider selecting`
			`center_ratio: Ratio applied within center region (relative scaling)`
			`border_ratio: Ratio applied within border region (relative scaling)`
			`"""`
			`self.embeddings_path = embeddings_path`
			`self.clustering_results_path = clustering_results_path`
			`self.embeddings = None`
			`self.embeddings_normalized = None`
			`self.clustering_results = None`
			`self.filepath_to_embedding = {}`
			`self.selection_ratio = selection_ratio`
			`self.center_ratio = center_ratio`
			`self.border_ratio = border_ratio`

			`def load_data(self):`
			`"""Load embeddings and clustering results"""`
			`print("Loading embeddings...")`
			`with open(self.embeddings_path, 'r') as f:`
			`embeddings_data = json.load(f)`

			`# Create mapping from filepath to embedding`
			`embeddings_list = []`
			`filepaths = []`
			`for item in embeddings_data:`
			`self.filepath_to_embedding[item['filepath']] = item['embedding']`
			`embeddings_list.append(item['embedding'])`
			`filepaths.append(item['filepath'])`

			`self.embeddings = np.array(embeddings_list, dtype=np.float32)`
			`self.embeddings_normalized = normalize(self.embeddings, norm='l2')`
			`print(f"Loaded {len(embeddings_list)} embeddings")`

			`print("Loading clustering results...")`
			`with open(self.clustering_results_path, 'r') as f:`
			`self.clustering_results = json.load(f)`

			`print(f"Loaded clustering results: {self.clustering_results['n_clusters']} clusters, "`
			`f"{self.clustering_results['n_samples']} samples")`

			`def group_by_clusters(self):`
			`"""Group data points by cluster labels"""`
			`clusters = {}`
			`noise_points = []`

			`for result in self.clustering_results['results']:`
			`cluster_id = result['cluster']`
			`filepath = result['filepath']`
			`if 'is_noise' in result:`
			`is_noise = result['is_noise']`
			`else:`
			`is_noise = False`

			`if is_noise or cluster_id == -1:`
			`noise_points.append({`
			`'filepath': filepath,`
			`'embedding': self.filepath_to_embedding[filepath]`
			`})`
			`else:`
			`if cluster_id not in clusters:`
			`clusters[cluster_id] = []`
			`clusters[cluster_id].append({`
			`'filepath': filepath,`
			`'embedding': self.filepath_to_embedding[filepath]`
			`})`

			`return clusters, noise_points`

			`def calculate_cluster_centroid(self, cluster_points):`
			`"""Calculate centroid of a cluster using normalized embeddings"""`
			`embeddings = np.array([point['embedding'] for point in cluster_points])`
			`embeddings_normalized = normalize(embeddings, norm='l2')`

			`# For cosine distance, centroid is the normalized mean`
			`centroid = np.mean(embeddings_normalized, axis=0)`
			`centroid_normalized = normalize(centroid.reshape(1, -1), norm='l2')[0]`

			`return centroid_normalized`

			`def calculate_cosine_distances_to_centroid(self, cluster_points, centroid):`
			`"""Calculate cosine distances from each point to cluster centroid"""`
			`embeddings = np.array([point['embedding'] for point in cluster_points])`
			`embeddings_normalized = normalize(embeddings, norm='l2')`

			`# Calculate cosine distances to centroid`
			`distances = cosine_distances(embeddings_normalized, centroid.reshape(1, -1)).flatten()`

			`return distances`

			`def filter_cluster(self, cluster_points):`
			`"""Lọc điểm trong một cluster dựa trên các tham số đã cấu hình."""`
			`if not cluster_points:`
			`return []`

			`selection_ratio = self.selection_ratio`
			`center_ratio = self.center_ratio`
			`border_ratio = self.border_ratio`

			`total_points = len(cluster_points)`
			`num_to_select = max(15, int(total_points * selection_ratio))`
			`if num_to_select >= total_points and selection_ratio != 1:`
			`return cluster_points`

			`centroid = self.calculate_cluster_centroid(cluster_points)`
			`distances = self.calculate_cosine_distances_to_centroid(cluster_points, centroid)`
			`point_distance_pairs = list(zip(cluster_points, distances))`
			`point_distance_pairs.sort(key=lambda x: x[1])`

			`dis = 0.1 # ngưỡng khoảng cách để phân loại center / border`
			`all_center_points = [p for p, d in point_distance_pairs if d < dis]`
			`all_border_points = [p for p, d in point_distance_pairs if d >= dis]`

			`print(f"Number of center points (distance < {dis}): {len(all_center_points)}")`
			`print(f"Number of border points (distance >= {dis}): {len(all_border_points)}")`

			`n_center = len(all_center_points)`
			`n_border = len(all_border_points)`

			`if n_center > 0:`
			`center_count = max(1, int(n_center * center_ratio * selection_ratio))`
			`center_count = min(center_count, n_center)`
			`else:`
			`center_count = 0`

			`if n_border < 70:`
			`border_count = n_border`
			`else:`
			`border_count = max(0, int(n_border * border_ratio * selection_ratio))`
			`border_count = min(border_count, n_border)`

			`random.seed(42)`
			`selected_points = []`
			`if center_count > 0:`
			`selected_points.extend(random.sample(all_center_points, center_count))`
			`if border_count > 0:`
			`selected_points.extend(random.sample(all_border_points, border_count))`

			`print(`
			`f"Cluster with {total_points} points -> selected {len(selected_points)} points "`
			`f"({center_count} center + {border_count} border)"`
			`)`
			`return selected_points`

			`def filter_all_clusters(self):`
			`"""Filter all clusters according to the specified criteria"""`
			`print("\n" + "="*60)`
			`print("FILTERING DBSCAN CLUSTERING RESULTS")`
			`print("="*60)`

			`clusters, noise_points = self.group_by_clusters()`

			`print(f"Found {len(clusters)} clusters and {len(noise_points)} noise points")`

			`filtered_results = []`

			`# Process each cluster`
			`for cluster_id, cluster_points in clusters.items():`
			`print(f"\nProcessing Cluster {cluster_id}:")`
			`filtered_points = self.filter_cluster(cluster_points)`

			`# Add cluster information`
			`for point in filtered_points:`
			`filtered_results.append({`
			`'filepath': point['filepath'],`
			`'cluster': cluster_id,`
			`'is_noise': False,`
			`'selection_type': 'cluster_filtered'`
			`})`

			`# Add all noise points`
			`print(f"\nAdding all {len(noise_points)} noise points...")`
			`n_noise = len(noise_points)`
			`noise_count = max(0, int(n_noise * self.selection_ratio))`
			`random.seed(42)`
			`selected_noise_points = random.sample(noise_points, noise_count)`

			`for point in selected_noise_points:`
			`filtered_results.append({`
			`'filepath': point['filepath'],`
			`'cluster': -1,`
			`'is_noise': True,`
			`'selection_type': 'noise'`
			`})`

			`return filtered_results`

			`def save_filtered_results(self, filtered_results, output_path=None):`
			`"""Save filtered results to JSON file"""`
			`if output_path is None:`
			`# Generate output filename based on input`
			`base_name = Path(self.clustering_results_path).stem`
			`output_path = f"{base_name}_filtered.json"`

			`# Create summary statistics`
			`cluster_stats = {}`
			`noise_count = 0`

			`for result in filtered_results:`
			`if result['is_noise']:`
			`noise_count += 1`
			`else:`
			`cluster_id = result['cluster']`
			`if cluster_id not in cluster_stats:`
			`cluster_stats[cluster_id] = 0`
			`cluster_stats[cluster_id] += 1`

			`# Prepare output data`
			`output_data = {`
			`"method": "DBSCAN_FILTERED",`
			`"original_n_clusters": self.clustering_results['n_clusters'],`
			`"original_n_samples": self.clustering_results['n_samples'],`
			`"filtered_n_samples": len(filtered_results),`
			`"filtering_criteria": {`
			`"cluster_selection_ratio": self.selection_ratio,`
			`"center_points_ratio": self.center_ratio,`
			`"border_points_ratio": self.border_ratio,`
			`"noise_points": "all_selected"`
			`},`
			`"cluster_statistics": cluster_stats,`
			`"noise_points": noise_count,`
			`"results": filtered_results`
			`}`

			`with open(output_path, 'w', encoding='utf-8') as f:`
			`json.dump(output_data, f, indent=4, ensure_ascii=False)`

			`print("\n" + "="*60)`
			`print("FILTERING SUMMARY")`
			`print("="*60)`
			`print(f"Original samples: {self.clustering_results['n_samples']}")`
			`print(f"Filtered samples: {len(filtered_results)}")`
			`print(f"Reduction ratio: {len(filtered_results)/self.clustering_results['n_samples']:.2%}")`
			`print("\nCluster breakdown:")`
			`for cluster_id, count in sorted(cluster_stats.items()):`
			`print(f" Cluster {cluster_id}: {count} points")`
			`print(f" Noise points: {noise_count} points")`
			`print(f"\nFiltered results saved to: {output_path}")`

			`return output_path`

			`def create_filepath_list(self, filtered_results, output_txt_path=None):`
			`"""Create a simple text file with filtered filepaths"""`
			`if output_txt_path is None:`
			`base_name = Path(self.clustering_results_path).stem`
			`output_txt_path = f"{base_name}_filtered_filepaths.txt"`

			`filepaths = [result['filepath'] for result in filtered_results]`

			`with open(output_txt_path, 'w', encoding='utf-8') as f:`
			`for filepath in filepaths:`
			`f.write(f"{filepath}\n")`

			`print(f"Filepath list saved to: {output_txt_path}")`
			`return output_txt_path`


			`def main():`
			`parser = argparse.ArgumentParser(description="Filter DBSCAN clustering results")`
			`parser.add_argument("--embeddings_path", required=True,`
			`help="Path to embeddings JSON file")`
			`parser.add_argument("--clustering_results_path", required=True,`
			`help="Path to DBSCAN clustering results JSON file")`
			`parser.add_argument("--output_path",`
			`help="Output path for filtered results (optional)")`
			`parser.add_argument("--create_filepath_list", action="store_true",`
			`help="Also create a simple text file with filtered filepaths")`
			`parser.add_argument("--selection_ratio", type=float, default=0.5,`
			`help="Overall ratio of points to sample per cluster (default: 0.5). Minimum 15 points enforced.")`
			`parser.add_argument("--center_ratio", type=float, default=0.5,`
			`help="Relative ratio applied to center region when sampling (default: 0.5)")`
			`parser.add_argument("--border_ratio", type=float, default=0.5,`
			`help="Relative ratio applied to border region when sampling (default: 0.5)")`

			`args = parser.parse_args()`

			`# Validate input files exist`
			`if not os.path.exists(args.embeddings_path):`
			`print(f"Error: Embeddings file not found: {args.embeddings_path}")`
			`return`

			`if not os.path.exists(args.clustering_results_path):`
			`print(f"Error: Clustering results file not found: {args.clustering_results_path}")`
			`return`

			`# Initialize filter`
			`# Initialize filter with user-provided ratios`
			`filter_obj = DBSCANFilter(`
			`args.embeddings_path,`
			`args.clustering_results_path,`
			`selection_ratio=args.selection_ratio,`
			`center_ratio=args.center_ratio,`
			`border_ratio=args.border_ratio`
			`)`

			`# Load data`
			`filter_obj.load_data()`

			`# Filter clusters`
			`filtered_results = filter_obj.filter_all_clusters()`

			`# Save results`
			`filter_obj.save_filtered_results(filtered_results, args.output_path)`

			`# Create filepath list if requested`
			`if args.create_filepath_list:`
			`filter_obj.create_filepath_list(filtered_results)`

			`print("\nFiltering completed successfully!")`


			`if __name__ == "__main__":`
			`main()`