Source code for expert.core.congruence.video_emotions.video_analysis

from __future__ import annotations

import os
from os import PathLike
from typing import List, Tuple

import albumentations as A
import cv2
import pandas as pd
import torch
from albumentations.pytorch.transforms import ToTensorV2
from torch import nn
from timm.models.layers import conv2d_same
import timm

from expert.core.functional_tools import get_model_weights_url
from expert.data.video_reader import VideoReader


def _summarise_window(window: pd.DataFrame, video_path: str | PathLike) -> dict:
    """Average the emotion columns of one time window into a report entry."""
    # One ``mean()`` call for the whole window instead of one per column.
    means = window[["video_anger", "video_neutral", "video_happiness"]].mean()
    return {
        "video_path": video_path,
        # The window is labelled by the timestamp of its first row.
        "time_sec": float(window["time_sec"].iloc[0]),
        "video_anger": float(means["video_anger"]),
        "video_neutral": float(means["video_neutral"]),
        "video_happiness": float(means["video_happiness"]),
    }


def _aggregate_windows(
    emo_data: pd.DataFrame, video_path: str | PathLike, duration: int
) -> List[dict]:
    """Split time-sorted per-frame scores into windows and average each one.

    ``emo_data`` must be sorted by ``time_sec`` and carry a 0..n-1 index.
    A window opens at some row and takes every following row whose timestamp
    is strictly less than ``opening timestamp + duration``.
    """
    report: List[dict] = []
    if emo_data.empty:
        return report

    times = emo_data["time_sec"].tolist()
    start = 0
    finish = 0
    for idx, current_time in enumerate(times):
        if current_time < times[start] + duration:
            finish = idx
            continue
        # ``.loc`` label slicing is inclusive on both ends.
        report.append(_summarise_window(emo_data.loc[start:finish], video_path))
        # The row that closed the previous window opens the next one. Both
        # bounds move together so a single-row window is never sliced empty.
        start = finish = idx

    # Flush the trailing window: no later row exists to trigger it in the loop.
    report.append(_summarise_window(emo_data.loc[start:finish], video_path))
    return report


def get_video_emotions(
    video_path: str | PathLike,
    features_path: str | PathLike,
    face_image: str | PathLike,
    device: torch.device | None = None,
    duration: int = 10,
) -> Tuple[List, str]:
    """Classification of expert emotions on video.

    Every frame listed for the selected face is run through the emotion
    model, three of the resulting class scores are kept (reported as anger,
    neutral and happiness), and the per-frame scores are averaged over
    consecutive time windows.

    Args:
        video_path (str | PathLike): Path to local video file.
        features_path (str | PathLike): Path to JSON file with information about detected faces.
            The columns ``speaker_by_video``, ``speaker_by_audio``,
            ``frame_index`` and ``time_sec`` are read from it.
        face_image (str | PathLike): Path to face image selected by user. The file name
            without extension must be an integer; it is matched against
            ``speaker_by_video``.
        device (torch.device | None, optional): Device type on local machine (GPU recommended).
            Defaults to None, in which case CPU is used.
        duration (int, optional): Length of one averaging window, in the units
            of ``time_sec``. Defaults to 10.

    Returns:
        Tuple[List, str]: A list with one dict per time window (keys
        ``video_path``, ``time_sec``, ``video_anger``, ``video_neutral``,
        ``video_happiness``) and the ``speaker_by_audio`` value that occurs
        most often among the rows of the selected face.

    Raises:
        ValueError: If the ``face_image`` file name is not an integer.
        IndexError: If no row in the features file matches the selected face.
    """
    if device is None:
        # The weights are loaded onto CPU below, so CPU is the natural default.
        device = torch.device("cpu")

    softmax = nn.Softmax(dim=1)

    model_name = "enet_b0_8_best_afew.pt"
    url = "https://github.com/HSE-asavchenko/face-emotion-recognition/raw/main/models/affectnet_emotions/enet_b0_8_best_afew.pt"
    cached_dir = get_model_weights_url(model_name=model_name, url=url)
    # NOTE(review): torch.load unpickles a whole model object fetched from a
    # remote URL; unpickling can execute arbitrary code, so the source must be
    # trusted. Recent torch releases may also need ``weights_only=False`` here
    # - confirm against the pinned torch version.
    model = torch.load(cached_dir, map_location=torch.device("cpu"))
    model = model.to(device).eval()

    # Declare an augmentation pipeline.
    transforms = A.Compose(
        [
            A.Resize(width=224, height=224),
            A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
            ToTensorV2(),
        ]
    )

    video = VideoReader(video_path)
    features = pd.read_json(features_path)

    # The selected face is identified by the numeric stem of the image name.
    expert_idx = int(os.path.splitext(os.path.basename(face_image))[0])
    features = features[features["speaker_by_video"] == expert_idx].reset_index(
        drop=True
    )
    # Audio speaker label with the most rows for this face.
    key = (
        features.groupby(by="speaker_by_audio")
        .count()
        .sort_values(by="speaker_by_video", ascending=False)
        .index[0]
    )

    emo_report = []
    # Inference only: skip building the autograd graph for every frame.
    with torch.no_grad():
        for row in features.index:
            current_frame = cv2.cvtColor(
                video[features.loc[row, "frame_index"]], cv2.COLOR_BGR2RGB
            )
            transformed = (
                transforms(image=current_frame)["image"].unsqueeze(0).to(device)
            )
            emotions = softmax(model(transformed))[0].cpu().detach()
            # Keep three classes and renormalise them among themselves.
            # NOTE(review): indices 6, 0 and 1 are reported as anger, neutral
            # and happiness - presumably the model's class order; confirm.
            lim_emotions = softmax(
                torch.Tensor([[emotions[6], emotions[0], emotions[1]]])
            )[0]

            if lim_emotions.isnan().any().item():
                # Scores that cannot be used are recorded as zeros.
                anger = neutral = happiness = 0.0
            else:
                anger, neutral, happiness = lim_emotions.numpy()

            emo_report.append(
                {
                    "time_sec": features.loc[row, "time_sec"],
                    "video_anger": anger,
                    "video_neutral": neutral,
                    "video_happiness": happiness,
                }
            )

    emo_data = pd.DataFrame(data=emo_report)
    emo_data = emo_data.sort_values(by="time_sec").reset_index(drop=True)
    emo_sep_report = _aggregate_windows(emo_data, video_path, duration)

    return emo_sep_report, key