Source code for expert.core.congruence.congruence_analysis

from __future__ import annotations

import json
import os
from os import PathLike

import pandas as pd
import torch

from expert.core.congruence.audio_emotions.audio_analysis import AudioAnalysis
from expert.core.congruence.text_emotions.text_analysis import get_text_emotions
from expert.core.congruence.video_emotions.video_analysis import (
    get_video_emotions,
)


def align_timestamps(time_sec, duration=10):
    """Round a timestamp down to the start of its analysis window.

    Args:
        time_sec (int | float): Timestamp in seconds.
        duration (int | float, optional): Window length in seconds.
            Defaults to 10, which reproduces the previously hard-coded grid.

    Returns:
        int | float: The largest multiple of ``duration`` that does not
        exceed ``time_sec``.

    Raises:
        ValueError: If ``duration`` is not positive.
    """
    if duration <= 0:
        raise ValueError("'duration' must be a positive number.")
    # Python's modulo follows the sign of the divisor, so this floors
    # negative timestamps as well (e.g. -3 -> -10 for a 10 s window).
    return time_sec - time_sec % duration


class CongruenceDetector:
    """Determination of expert emotions congruence.

    Emotion estimates are obtained separately for the video, audio and text
    channels, aligned to a common grid of ``duration``-second windows and
    compared window by window.

    Args:
        video_path (str | PathLike): Path to local video file.
        features_path (str | PathLike): Path to JSON file with information
            about detected faces.
        face_image (str | PathLike): Path to face image selected by user.
        transcription_path (str | PathLike): Path to JSON file with text
            transcription.
        diarization_path (str | PathLike): Path to JSON file with diarization
            information.
        lang (str, optional): Speech language for text processing
            ['ru', 'en']. Defaults to 'en'.
        duration (int, optional): Length of intervals for extracting
            features. Defaults to 10.
        sr (int, optional): Sample rate. Defaults to 44100.
        device (torch.device | None, optional): Device type on local machine
            (GPU recommended). Defaults to None, which selects the CPU.
        output_dir (str | PathLike | None, optional): Path to the folder for
            saving results. Defaults to None, in which case
            ``temp/<video file name without extension>`` is used.

    Raises:
        NotImplementedError: If 'lang' is not equal to 'en' or 'ru'.
        ValueError: If 'duration' is not positive.

    Example:
        >>> import torch
        >>> cong_detector = CongruenceDetector(
        ...     video_path="test_video.mp4",
        ...     features_path="temp/test_video/features.json",
        ...     face_image="temp/test_video/faces/0.jpg",
        ...     transcription_path="temp/test_video/transcription.json",
        ...     diarization_path="temp/test_video/diarization.json",
        ...     device=torch.device("cuda:0"),
        ... )
        >>> cong_detector.get_congruence()
        ("temp/test_video/emotions.json", "temp/test_video/congruence.json")
    """

    # Column names are built as "<channel>_<emotion>".
    _CHANNELS = ("video", "audio", "text")
    _EMOTIONS = ("neutral", "anger", "happiness")
    # Divisor applied to the summed per-emotion deviations.
    # NOTE(review): presumably the expected upper bound of that sum - confirm.
    _CONGRUENCE_NORM = 1.5

    def __init__(
        self,
        video_path: str | PathLike,
        features_path: str | PathLike,
        face_image: str | PathLike,
        transcription_path: str | PathLike,
        diarization_path: str | PathLike,
        lang: str = "en",
        duration: int = 10,
        sr: int = 44100,
        device: torch.device | None = None,
        output_dir: str | PathLike | None = None,
    ):
        if lang not in ("en", "ru"):
            raise NotImplementedError("'lang' must be 'en' or 'ru'.")
        # The window length is used as a modulus when aligning timestamps,
        # so reject values that cannot describe a window up front.
        if duration <= 0:
            raise ValueError("'duration' must be a positive number.")

        self.lang = lang
        self.video_path = video_path
        self.features_path = features_path
        self.face_image = face_image
        self.transcription_path = transcription_path
        self.duration = duration
        self.sr = sr
        self._device = device if device is not None else torch.device("cpu")

        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(diarization_path, "r", encoding="utf-8") as file:
            self.stamps = json.load(file)

        if output_dir is not None:
            self.temp_path = output_dir
        else:
            basename = os.path.splitext(os.path.basename(video_path))[0]
            self.temp_path = os.path.join("temp", basename)
        # exist_ok avoids the check-then-create race of a separate
        # os.path.exists() test.
        os.makedirs(self.temp_path, exist_ok=True)

    @property
    def device(self) -> torch.device:
        """Check the device type.

        Returns:
            torch.device: Device type on local machine.
        """
        return self._device

    def _align_frame(self, records) -> pd.DataFrame:
        """Build a frame with at most one row per ``duration``-second window.

        Args:
            records: Data accepted by ``pandas.DataFrame`` that yields a
                ``time_sec`` column.

        Returns:
            pd.DataFrame: Rows with ``time_sec`` rounded down to a multiple
            of ``self.duration``, de-duplicated on ``time_sec`` (first row
            wins), sorted by ``time_sec`` and re-indexed from zero.
        """
        frame = pd.DataFrame(data=records)
        # Align to the configured window length rather than a fixed 10 s
        # grid, so that windows shorter than 10 s are not merged and then
        # discarded by drop_duplicates.
        stamps = frame["time_sec"]
        frame["time_sec"] = stamps - stamps % self.duration
        return (
            frame.drop_duplicates(subset=["time_sec"])
            .sort_values(by="time_sec")
            .reset_index(drop=True)
        )

    def get_video_state(self):
        """Estimate emotions from the video channel.

        Returns:
            tuple: The aligned video emotion frame and the key returned by
            ``get_video_emotions``, which is later passed on to the audio
            and text analysis to select the speaker.
        """
        video_data, key = get_video_emotions(
            video_path=self.video_path,
            features_path=self.features_path,
            face_image=self.face_image,
            device=self._device,
            duration=self.duration,
        )
        return self._align_frame(video_data), key

    def get_audio_state(self, key):
        """Estimate emotions from the audio channel.

        Args:
            key: Speaker key as returned by ``get_video_state``.

        Returns:
            pd.DataFrame: The aligned audio emotion frame.
        """
        audio_model = AudioAnalysis(
            video_path=self.video_path,
            stamps=self.stamps,
            speaker=key,
            sr=self.sr,
            duration=self.duration,
            device=self._device,
        )
        return self._align_frame(audio_model.predict())

    def get_text_state(self, key):
        """Estimate emotions from the transcribed text.

        Args:
            key: Speaker key as returned by ``get_video_state``.

        Returns:
            pd.DataFrame: The aligned text emotion frame.
        """
        text_data = get_text_emotions(
            words_path=self.transcription_path,
            stamps=self.stamps,
            key=key,
            device=self._device,
            duration=self.duration,
        )
        return self._align_frame(text_data)

    def get_congruence(self):
        """Compute the congruence score and write both JSON reports.

        Returns:
            Tuple[str, str]: Paths to the emotion and congruence reports.
        """
        video_data, key = self.get_video_state()
        audio_data = self.get_audio_state(key=key)
        text_data = self.get_text_state(key=key)

        # Keep only the windows for which all three channels have a row.
        cong_data = video_data
        for channel_data in (audio_data, text_data):
            cong_data = cong_data.join(
                channel_data.set_index("time_sec"), how="inner", on="time_sec"
            )
        cong_data = cong_data.sort_values(by="time_sec").reset_index(drop=True)

        # Calculate the sum of deviations between emotions and normalize
        # the value.
        deviation = None
        for emotion in self._EMOTIONS:
            columns = [f"{channel}_{emotion}" for channel in self._CHANNELS]
            emotion_std = cong_data.loc[:, columns].std(axis=1)
            deviation = (
                emotion_std if deviation is None else deviation + emotion_std
            )
        cong_data["congruence"] = deviation / self._CONGRUENCE_NORM

        emotions_path = os.path.join(self.temp_path, "emotions.json")
        congruence_path = os.path.join(self.temp_path, "congruence.json")

        # Get and save data with all emotions.
        emotions_data = {
            "video": video_data.to_dict(orient="records"),
            "audio": audio_data.to_dict(orient="records"),
            "text": text_data.to_dict(orient="records"),
        }
        with open(emotions_path, "w", encoding="utf-8") as report:
            json.dump(emotions_data, report)

        # NOTE(review): assumes the joined frame carries a "video_path"
        # column supplied by one of the channel frames - confirm.
        cong_data[["video_path", "time_sec", "congruence"]].to_json(
            congruence_path, orient="records"
        )
        return emotions_path, congruence_path