SkalskiP committed
Commit fa98faf
1 Parent(s): 33e6030

Point prompt mode ready for review

Files changed (3):
  1. app.py +186 -68
  2. utils/draw.py +32 -0
  3. utils/efficient_sam.py +33 -0
app.py CHANGED
@@ -7,7 +7,8 @@ import torch
 from PIL import Image
 from transformers import SamModel, SamProcessor
 
-from utils.efficient_sam import load, inference_with_box
+from utils.efficient_sam import load, inference_with_box, inference_with_point
+from utils.draw import draw_circle, calculate_dynamic_circle_radius
 
 MARKDOWN = """
 # EfficientSAM sv. SAM
@@ -17,28 +18,74 @@ This is a demo for ⚔️ SAM Battlegrounds - a speed and accuracy comparison be
 [SAM](https://arxiv.org/abs/2304.02643).
 """
 
+BOX_EXAMPLES = [
+    ['https://media.roboflow.com/efficient-sam/corgi.jpg', 801, 510, 1782, 993],
+    ['https://media.roboflow.com/efficient-sam/horses.jpg', 814, 696, 1523, 1183],
+    ['https://media.roboflow.com/efficient-sam/bears.jpg', 653, 874, 1173, 1229]
+]
+
+POINT_EXAMPLES = [
+    ['https://media.roboflow.com/efficient-sam/corgi.jpg', 1291, 751],
+    ['https://media.roboflow.com/efficient-sam/horses.jpg', 1168, 939],
+    ['https://media.roboflow.com/efficient-sam/bears.jpg', 913, 1051]
+]
+
+PROMPT_COLOR = sv.Color.from_hex("#D3D3D3")
+MASK_COLOR = sv.Color.from_hex("#FF0000")
 DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 SAM_MODEL = SamModel.from_pretrained("facebook/sam-vit-huge").to(DEVICE)
 SAM_PROCESSOR = SamProcessor.from_pretrained("facebook/sam-vit-huge")
 EFFICIENT_SAM_MODEL = load(device=DEVICE)
 MASK_ANNOTATOR = sv.MaskAnnotator(
-    color=sv.Color.red(),
-    color_lookup=sv.ColorLookup.INDEX)
-BOX_ANNOTATOR = sv.BoundingBoxAnnotator(
-    color=sv.Color.red(),
+    color=MASK_COLOR,
     color_lookup=sv.ColorLookup.INDEX)
 
 
-def annotate_image(image: np.ndarray, detections: sv.Detections) -> np.ndarray:
+def annotate_image_with_box_prompt_result(
+    image: np.ndarray,
+    detections: sv.Detections,
+    x_min: int,
+    y_min: int,
+    x_max: int,
+    y_max: int
+) -> np.ndarray:
+    h, w, _ = image.shape
     bgr_image = image[:, :, ::-1]
     annotated_bgr_image = MASK_ANNOTATOR.annotate(
         scene=bgr_image, detections=detections)
-    annotated_bgr_image = BOX_ANNOTATOR.annotate(
-        scene=annotated_bgr_image, detections=detections)
+    annotated_bgr_image = sv.draw_rectangle(
+        scene=annotated_bgr_image,
+        rect=sv.Rect(
+            x=x_min,
+            y=y_min,
+            width=int(x_max - x_min),
+            height=int(y_max - y_min),
+        ),
+        color=PROMPT_COLOR,
+        thickness=sv.calculate_dynamic_line_thickness(resolution_wh=(w, h))
+    )
     return annotated_bgr_image[:, :, ::-1]
 
 
-def efficient_sam_inference(
+def annotate_image_with_point_prompt_result(
+    image: np.ndarray,
+    detections: sv.Detections,
+    x: int,
+    y: int
+) -> np.ndarray:
+    h, w, _ = image.shape
+    bgr_image = image[:, :, ::-1]
+    annotated_bgr_image = MASK_ANNOTATOR.annotate(
+        scene=bgr_image, detections=detections)
+    annotated_bgr_image = draw_circle(
+        scene=annotated_bgr_image,
+        center=sv.Point(x=x, y=y),
+        radius=calculate_dynamic_circle_radius(resolution_wh=(w, h)),
+        color=PROMPT_COLOR)
+    return annotated_bgr_image[:, :, ::-1]
+
+
+def efficient_sam_box_inference(
     image: np.ndarray,
     x_min: int,
     y_min: int,
@@ -49,10 +96,17 @@ def efficient_sam_inference(
     mask = inference_with_box(image, box, EFFICIENT_SAM_MODEL, DEVICE)
     mask = mask[np.newaxis, ...]
     detections = sv.Detections(xyxy=sv.mask_to_xyxy(masks=mask), mask=mask)
-    return annotate_image(image=image, detections=detections)
+    return annotate_image_with_box_prompt_result(
+        image=image,
+        detections=detections,
+        x_max=x_max,
+        x_min=x_min,
+        y_max=y_max,
+        y_min=y_min
+    )
 
 
-def sam_inference(
+def sam_box_inference(
     image: np.ndarray,
     x_min: int,
     y_min: int,
@@ -76,10 +130,17 @@ def sam_inference(
     )[0][0][0].numpy()
     mask = mask[np.newaxis, ...]
     detections = sv.Detections(xyxy=sv.mask_to_xyxy(masks=mask), mask=mask)
-    return annotate_image(image=image, detections=detections)
+    return annotate_image_with_box_prompt_result(
+        image=image,
+        detections=detections,
+        x_max=x_max,
+        x_min=x_min,
+        y_max=y_max,
+        y_min=y_min
+    )
 
 
-def inference(
+def box_inference(
     image: np.ndarray,
     x_min: int,
     y_min: int,
@@ -87,8 +148,46 @@ def inference(
     y_max: int
 ) -> Tuple[np.ndarray, np.ndarray]:
     return (
-        efficient_sam_inference(image, x_min, y_min, x_max, y_max),
-        sam_inference(image, x_min, y_min, x_max, y_max)
+        efficient_sam_box_inference(image, x_min, y_min, x_max, y_max),
+        sam_box_inference(image, x_min, y_min, x_max, y_max)
+    )
+
+
+def efficient_sam_point_inference(image: np.ndarray, x: int, y: int) -> np.ndarray:
+    point = np.array([[x, y]])
+    mask = inference_with_point(image, point, EFFICIENT_SAM_MODEL, DEVICE)
+    mask = mask[np.newaxis, ...]
+    detections = sv.Detections(xyxy=sv.mask_to_xyxy(masks=mask), mask=mask)
+    return annotate_image_with_point_prompt_result(
+        image=image, detections=detections, x=x, y=y)
+
+
+def sam_point_inference(image: np.ndarray, x: int, y: int) -> np.ndarray:
+    input_points = [[[x, y]]]
+    inputs = SAM_PROCESSOR(
+        Image.fromarray(image),
+        input_points=[input_points],
+        return_tensors="pt"
+    ).to(DEVICE)
+
+    with torch.no_grad():
+        outputs = SAM_MODEL(**inputs)
+
+    mask = SAM_PROCESSOR.image_processor.post_process_masks(
+        outputs.pred_masks.cpu(),
+        inputs["original_sizes"].cpu(),
+        inputs["reshaped_input_sizes"].cpu()
+    )[0][0][0].numpy()
+    mask = mask[np.newaxis, ...]
+    detections = sv.Detections(xyxy=sv.mask_to_xyxy(masks=mask), mask=mask)
+    return annotate_image_with_point_prompt_result(
+        image=image, detections=detections, x=x, y=y)
+
+
+def point_inference(image: np.ndarray, x: int, y: int) -> Tuple[np.ndarray, np.ndarray]:
+    return (
+        efficient_sam_point_inference(image, x, y),
+        sam_point_inference(image, x, y)
     )
 
 
@@ -96,73 +195,92 @@ def clear(_: np.ndarray) -> Tuple[None, None]:
     return None, None
 
 
+box_input_image = gr.Image()
+x_min_number = gr.Number(label="x_min")
+y_min_number = gr.Number(label="y_min")
+x_max_number = gr.Number(label="x_max")
+y_max_number = gr.Number(label="y_max")
+box_inputs = [box_input_image, x_min_number, y_min_number, x_max_number, y_max_number]
+
+point_input_image = gr.Image()
+x_number = gr.Number(label="x")
+y_number = gr.Number(label="y")
+point_inputs = [point_input_image, x_number, y_number]
+
+
 with gr.Blocks() as demo:
     gr.Markdown(MARKDOWN)
     with gr.Tab(label="Box prompt"):
         with gr.Row():
             with gr.Column():
-                input_image = gr.Image()
+                box_input_image.render()
                 with gr.Accordion(label="Box", open=False):
                     with gr.Row():
-                        x_min_number = gr.Number(label="x_min")
-                        y_min_number = gr.Number(label="y_min")
-                        x_max_number = gr.Number(label="x_max")
-                        y_max_number = gr.Number(label="y_max")
-            efficient_sam_output_image = gr.Image(label="EfficientSAM")
-            sam_output_image = gr.Image(label="SAM")
+                        x_min_number.render()
+                        y_min_number.render()
+                        x_max_number.render()
+                        y_max_number.render()
+            efficient_sam_box_output_image = gr.Image(label="EfficientSAM")
+            sam_box_output_image = gr.Image(label="SAM")
         with gr.Row():
-            submit_button = gr.Button("Submit")
-
+            submit_box_inference_button = gr.Button("Submit")
+        gr.Examples(
+            fn=box_inference,
+            examples=BOX_EXAMPLES,
+            inputs=box_inputs,
+            outputs=[efficient_sam_box_output_image, sam_box_output_image],
+        )
+    with gr.Tab(label="Point prompt"):
+        with gr.Row():
+            with gr.Column():
+                point_input_image.render()
+                with gr.Accordion(label="Point", open=False):
+                    with gr.Row():
+                        x_number.render()
+                        y_number.render()
+            efficient_sam_point_output_image = gr.Image(label="EfficientSAM")
+            sam_point_output_image = gr.Image(label="SAM")
+        with gr.Row():
+            submit_point_inference_button = gr.Button("Submit")
         gr.Examples(
-            fn=inference,
-            examples=[
-                [
-                    'https://media.roboflow.com/efficient-sam/beagle.jpeg',
-                    69,
-                    26,
-                    625,
-                    704
-                ],
-                [
-                    'https://media.roboflow.com/efficient-sam/corgi.jpg',
-                    801,
-                    510,
-                    1782,
-                    993
-                ],
-                [
-                    'https://media.roboflow.com/efficient-sam/horses.jpg',
-                    814,
-                    696,
-                    1523,
-                    1183
-                ],
-                [
-                    'https://media.roboflow.com/efficient-sam/bears.jpg',
-                    653,
-                    874,
-                    1173,
-                    1229
-                ]
-            ],
-            inputs=[input_image, x_min_number, y_min_number, x_max_number, y_max_number],
-            outputs=[efficient_sam_output_image, sam_output_image],
+            fn=point_inference,
+            examples=POINT_EXAMPLES,
+            inputs=point_inputs,
+            outputs=[efficient_sam_point_output_image, sam_point_output_image],
         )
 
-    submit_button.click(
-        efficient_sam_inference,
-        inputs=[input_image, x_min_number, y_min_number, x_max_number, y_max_number],
-        outputs=efficient_sam_output_image
+    submit_box_inference_button.click(
+        efficient_sam_box_inference,
+        inputs=box_inputs,
+        outputs=efficient_sam_box_output_image
     )
-    submit_button.click(
-        sam_inference,
-        inputs=[input_image, x_min_number, y_min_number, x_max_number, y_max_number],
-        outputs=sam_output_image
+    submit_box_inference_button.click(
+        sam_box_inference,
+        inputs=box_inputs,
+        outputs=sam_box_output_image
     )
-    input_image.change(
+
+    submit_point_inference_button.click(
+        efficient_sam_point_inference,
+        inputs=point_inputs,
+        outputs=efficient_sam_point_output_image
+    )
+    submit_point_inference_button.click(
+        sam_point_inference,
+        inputs=point_inputs,
+        outputs=sam_point_output_image
+    )
+
+    box_input_image.change(
         clear,
-        inputs=input_image,
-        outputs=[efficient_sam_output_image, sam_output_image]
+        inputs=box_input_image,
+        outputs=[efficient_sam_box_output_image, sam_box_output_image]
+    )
+
+    point_input_image.change(
+        clear,
+        inputs=point_input_image,
+        outputs=[efficient_sam_point_output_image, sam_point_output_image]
    )
 
 demo.launch(debug=False, show_error=True)
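For reviewers who want to exercise the new point path without launching the Gradio UI, here is a minimal sketch of the same flow `efficient_sam_point_inference` wires up. It assumes the script runs from the repo root (so `utils` is importable); the local file name `corgi.jpg` is hypothetical, standing in for a downloaded copy of the POINT_EXAMPLES image.

```python
# Minimal sketch of the new point-prompt flow outside Gradio.
# Assumes: run from the repo root so `utils` resolves; `corgi.jpg` is a
# hypothetical local copy of the corgi example image.
import numpy as np
import supervision as sv
import torch
from PIL import Image

from utils.efficient_sam import load, inference_with_point

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = load(device=device)

image = np.asarray(Image.open("corgi.jpg").convert("RGB"))
point = np.array([[1291, 751]])  # (x, y) click, same values as POINT_EXAMPLES

mask = inference_with_point(image, point, model, device)  # (H, W) bool mask
mask = mask[np.newaxis, ...]  # sv.Detections expects masks shaped (N, H, W)
detections = sv.Detections(xyxy=sv.mask_to_xyxy(masks=mask), mask=mask)
print(detections.xyxy)  # bounding box derived from the predicted mask
```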
utils/draw.py ADDED
@@ -0,0 +1,32 @@
+from typing import Tuple
+
+import cv2
+import numpy as np
+import supervision as sv
+
+
+def draw_circle(
+    scene: np.ndarray, center: sv.Point, color: sv.Color, radius: int = 2
+) -> np.ndarray:
+    cv2.circle(
+        scene,
+        center=center.as_xy_int_tuple(),
+        radius=radius,
+        color=color.as_bgr(),
+        thickness=-1,
+    )
+    return scene
+
+
+def calculate_dynamic_circle_radius(resolution_wh: Tuple[int, int]) -> int:
+    min_dimension = min(resolution_wh)
+    if min_dimension < 480:
+        return 4
+    if min_dimension < 720:
+        return 8
+    if min_dimension < 1080:
+        return 8
+    if min_dimension < 2160:
+        return 16
+    else:
+        return 16
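A quick way to sanity-check the new helpers is to draw on a blank canvas; the sketch below is illustrative only (canvas size and point are arbitrary). For a 1280x720 frame the minimum dimension is 720, so `calculate_dynamic_circle_radius` returns 8; note the `< 1080` and `else` branches repeat the values of the branches above them, so the table collapses to three distinct radii (4, 8, 16).

```python
# Illustrative check of the new draw helpers on a blank 1280x720 canvas.
import numpy as np
import supervision as sv

from utils.draw import calculate_dynamic_circle_radius, draw_circle

canvas = np.zeros((720, 1280, 3), dtype=np.uint8)
radius = calculate_dynamic_circle_radius(resolution_wh=(1280, 720))  # -> 8
canvas = draw_circle(
    scene=canvas,
    center=sv.Point(x=640, y=360),       # arbitrary prompt location
    color=sv.Color.from_hex("#D3D3D3"),  # same hex as PROMPT_COLOR in app.py
    radius=radius,
)
```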
utils/efficient_sam.py CHANGED
@@ -45,3 +45,36 @@ def inference_with_box(
             max_predicted_iou = curr_predicted_iou
             selected_mask_using_predicted_iou = all_masks[m]
     return selected_mask_using_predicted_iou
+
+
+def inference_with_point(
+    image: np.ndarray,
+    point: np.ndarray,
+    model: torch.jit.ScriptModule,
+    device: torch.device
+) -> np.ndarray:
+    pts_sampled = torch.reshape(torch.tensor(point), [1, 1, -1, 2])
+    max_num_pts = pts_sampled.shape[2]
+    pts_labels = torch.ones(1, 1, max_num_pts)
+    img_tensor = ToTensor()(image)
+
+    predicted_logits, predicted_iou = model(
+        img_tensor[None, ...].to(device),
+        pts_sampled.to(device),
+        pts_labels.to(device),
+    )
+    predicted_logits = predicted_logits.cpu()
+    all_masks = torch.ge(torch.sigmoid(predicted_logits[0, 0, :, :, :]), 0.5).numpy()
+    predicted_iou = predicted_iou[0, 0, ...].cpu().detach().numpy()
+
+    max_predicted_iou = -1
+    selected_mask_using_predicted_iou = None
+    for m in range(all_masks.shape[0]):
+        curr_predicted_iou = predicted_iou[m]
+        if (
+            curr_predicted_iou > max_predicted_iou
+            or selected_mask_using_predicted_iou is None
+        ):
+            max_predicted_iou = curr_predicted_iou
+            selected_mask_using_predicted_iou = all_masks[m]
+    return selected_mask_using_predicted_iou
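Because the prompt is reshaped to `[1, 1, -1, 2]` and `pts_labels` is all ones (every point marked foreground), `inference_with_point` should also accept several clicks at once. A sketch under that assumption, with illustrative coordinates and a hypothetical local `horses.jpg`:

```python
# Sketch: multi-point prompting. An (N, 2) array becomes [1, 1, N, 2]
# inside inference_with_point, and every point is labeled foreground.
# `horses.jpg` is a hypothetical local copy of the example image.
import numpy as np
import torch
from PIL import Image

from utils.efficient_sam import load, inference_with_point

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = load(device=device)

image = np.asarray(Image.open("horses.jpg").convert("RGB"))
points = np.array([[1168, 939], [1100, 900]])  # two nearby clicks (illustrative)
mask = inference_with_point(image, points, model, device)
```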