# Source code for expert.data.detection.face_detector

from __future__ import annotations

from typing import List, Optional, Union

import albumentations as A
import cv2
import mediapipe
import numpy as np
import torch
from albumentations.pytorch.transforms import ToTensorV2

from expert.data.detection.inception_resnet_v1 import InceptionResnetV1


class FaceDetector:
    """Face detection and embedding implementation.

    FaceDetector processes a BGR image and returns a list of the detected
    face embeddings and bounding boxes.

    Face locations are expressed in pixels as ``[[x, y], [width, height]]``,
    where ``(x, y)`` is the top-left corner of the bounding box.

    Example:
        >>> face_detector = FaceDetector(model_selection=0, min_detection_confidence=0.9)
    """

    def __init__(
        self,
        model_selection: Optional[int] = 0,
        min_detection_confidence: Optional[float] = 0.75,
        max_num_faces: Optional[int] = 10,
        device: Optional[torch.device] = None,
    ) -> None:
        """
        Args:
            model_selection (Optional[int]): 0 or 1. 0 to select a short-range model that works
                best for faces within 2 meters from the camera, and 1 for a full-range
                model best for faces within 5 meters. Defaults to 0.
            min_detection_confidence (Optional[float]): Minimum confidence value ([0.0, 1.0]) for face
                detection to be considered successful. Defaults to 0.75.
            max_num_faces (Optional[int]): Maximum number of detections to inspect per image.
                ``None`` disables the limit. Defaults to 10.
            device (Optional[torch.device]): Device the embedding model runs on (GPU recommended).
                Defaults to None, in which case the CPU is used.
        """
        super().__init__()

        self.max_num_faces = max_num_faces
        self.face_detector = mediapipe.solutions.face_detection.FaceDetection(
            model_selection=model_selection,
            min_detection_confidence=min_detection_confidence,
        )

        # The embedder runs on the requested device; CPU is the fallback
        # when the caller does not pass one.
        self._device = torch.device("cpu") if device is None else device
        self.face_embedder = InceptionResnetV1(
            pretrained="vggface2", device=self._device
        ).eval()

        # Preprocessing applied to every face crop before embedding:
        # resize to 224x224, normalize per channel, convert to a tensor.
        self.transform = A.Compose(
            [
                A.Resize(width=224, height=224),
                A.Normalize(
                    mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)
                ),
                ToTensorV2(),
            ]
        )

    @property
    def device(self) -> torch.device:
        """Check the device type.

        Returns:
            torch.device: Device type on local machine.
        """
        return self._device

    @torch.no_grad()
    def detect(self, image: np.ndarray) -> List:
        """Detect faces and return their bounding boxes in pixel coordinates.

        Only the first ``max_num_faces`` detections are inspected (all of them
        when ``max_num_faces`` is None). Boxes with any negative coordinate or
        size are discarded, so fewer entries than the limit may be returned.

        Args:
            image (np.ndarray): RGB image represented as numpy ndarray.

        Returns:
            List: List with detected face locations, each formatted as
                ``[[x, y], [width, height]]``. Empty if nothing was detected.
        """
        image_height, image_width = image.shape[:2]
        prediction = self.face_detector.process(image)

        detections = prediction.detections
        if not detections:
            return []

        if self.max_num_faces is not None:
            # Clamp at zero so a negative limit yields no faces instead of
            # being interpreted as a "drop the last N" slice.
            detections = detections[: max(self.max_num_faces, 0)]

        face_array = []
        for detection in detections:
            box = detection.location_data.relative_bounding_box
            # Relative coordinates are scaled to pixels and truncated to int.
            face_location = [
                [int(box.xmin * image_width), int(box.ymin * image_height)],
                [int(box.width * image_width), int(box.height * image_height)],
            ]

            # Negative values would be misread as from-the-end indices when
            # the image is sliced, so such boxes are dropped.
            if all(value >= 0 for pair in face_location for value in pair):
                face_array.append(face_location)

        return face_array

    @torch.no_grad()
    def embed(self, image: np.ndarray) -> List:
        """Cropping and embedding area where the face is located.

        Args:
            image (np.ndarray): BGR image represented as numpy ndarray.

        Returns:
            List: List of ``[embedding, face_location]`` pairs, where
                ``embedding`` is a list of floats and ``face_location`` is
                ``[[x, y], [width, height]]`` in pixels.
        """
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        face_batch = []
        for face_location in self.detect(image=image):
            (x_min, y_min), (box_width, box_height) = face_location
            # Slicing silently clips boxes that extend past the image border.
            face_image = image[
                y_min : y_min + box_height, x_min : x_min + box_width
            ]

            # A zero-area crop (zero-sized box or box outside the frame)
            # cannot be resized, so it is skipped rather than crashing.
            if face_image.size == 0:
                continue

            transformed_face = self.transform(image=face_image)["image"]
            in_face = transformed_face.unsqueeze(0).to(self._device)
            face_emb = self.face_embedder(in_face)[0].detach().cpu().tolist()

            face_batch.append([face_emb, face_location])

        return face_batch