Skip to content

Commit

Permalink
perf: clean vss images with cleanvision defaults removing all dark and blurry images
Browse files Browse the repository at this point in the history
  • Loading branch information
danellecline committed Oct 28, 2024
1 parent b93ebb1 commit 1ccb994
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 27 deletions.
40 changes: 16 additions & 24 deletions aipipeline/prediction/library.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import glob
from cleanvision import Imagelab
import json
import logging
import multiprocessing
Expand Down Expand Up @@ -77,35 +77,27 @@ def generate_multicrop_views2(image) -> List[tuple]:
return data


def clean_bad_images(element) -> tuple:
    """Remove dark or blurry images from a crop directory using cleanvision.

    :param element: tuple of (count, crop_path, save_path) where count is the
        number of images currently in crop_path
    :return: tuple of (count - num_removed, crop_path, save_path) with the
        count adjusted for the images deleted
    """
    count, crop_path, save_path = element
    num_removed = 0
    imagelab = Imagelab(data_path=crop_path)
    imagelab.find_issues()
    imagelab.report()
    # Select rows flagged dark OR blurry with element-wise pandas boolean
    # indexing. The previous expression (["is_dark_issue"] == True or ...)
    # compared a list literal to True — always False — so no rows matched,
    # and it tested is_dark_issue twice instead of also testing blurriness.
    issues = imagelab.issues
    bad_images = issues[issues["is_dark_issue"] | issues["is_blurry_issue"]]
    # The issues DataFrame is indexed by the image file path, so the index
    # values are the paths to delete.
    for img in bad_images.index:
        os.remove(img)
        num_removed += 1
    logger.info(f"Removed {num_removed} dark or blurry images in {crop_path}")
    return count - num_removed, crop_path, save_path


def clean_images(elements) -> List[tuple]:
    """Remove dark or blurry images for every (count, crop_path, save_path)
    element and return the elements with their counts adjusted.

    :param elements: list of (count, crop_path, save_path) tuples
    :return: list of tuples with counts reduced by the number of removed images
    """
    logger.info(f"Cleaning bad images in {elements} ")
    # Collect the updated tuples from clean_bad_images; previously the
    # results were discarded and the original elements (with stale counts)
    # were returned to the next pipeline stage.
    return [clean_bad_images(element) for element in elements]


def generate_multicrop_views(elements) -> List[tuple]:
Expand Down
4 changes: 2 additions & 2 deletions aipipeline/prediction/vss_init_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
gen_machine_friendly_label,
clean,
batch_elements,
ProcessClusterBatch, remove_multicrop_views, clean_blurriness,
ProcessClusterBatch, remove_multicrop_views, clean_images,
)

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -173,7 +173,7 @@ def run_pipeline(argv=None):
start
| "Crop ROI" >> beam.Map(crop_rois_voc, config_dict=config_dict)
| "Generate views" >> beam.Map(generate_multicrop_views)
| 'Remove blurred images' >> beam.Map(clean_blurriness, min_variance=args.min_variance)
| "Clean dark blurry examples" >> beam.Map(clean_images)
| 'Batch cluster ROI elements' >> beam.FlatMap(lambda x: batch_elements(x, batch_size=2))
| 'Process cluster ROI batches' >> beam.ParDo(ProcessClusterBatch(config_dict=config_dict))
| "Load exemplars" >> beam.Map(load_exemplars, config_dict=config_dict, conf_files=conf_files)
Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,5 @@ scikit-learn
matplotlib
albumentations
torch
apache-beam
apache-beam
cleanvision

0 comments on commit 1ccb994

Please sign in to comment.