from __future__ import annotations
import json
import os
from os import PathLike
import pandas as pd
import torch
from expert.core.congruence.audio_emotions.audio_analysis import AudioAnalysis
from expert.core.congruence.text_emotions.text_analysis import get_text_emotions
from expert.core.congruence.video_emotions.video_analysis import (
get_video_emotions,
)
def align_timestamps(time_sec, duration=10):
    """Snap a timestamp down to the start of the interval that contains it.

    The result is the largest multiple of ``duration`` that does not exceed
    ``time_sec`` (for non-negative input), so timestamps produced by different
    sources inside the same interval collapse to one common value.

    Args:
        time_sec (int | float): Timestamp in seconds.
        duration (int | float, optional): Interval length in seconds.
            Defaults to 10, the bucket width that was previously hard-coded.

    Returns:
        int | float: ``time_sec`` rounded down to a multiple of ``duration``.
    """
    # x - x % d floors x to the nearest lower multiple of d.
    return time_sec - time_sec % duration
class CongruenceDetector:
    """Determination of expert emotions congruence.

    Emotion estimates are obtained separately from the video, audio and text
    channels, aligned on a common time grid and compared: the congruence value
    of an interval is derived from the spread of the per-channel scores for the
    'neutral', 'anger' and 'happiness' emotions.

    Args:
        video_path (str | PathLike): Path to local video file.
        features_path (str | PathLike): Path to JSON file with information about detected faces.
        face_image (str | PathLike): Path to face image selected by user.
        transcription_path (str | PathLike): Path to JSON file with text transcription.
        diarization_path (str | PathLike): Path to JSON file with diarization information.
        lang (str, optional): Speech language for text processing ['ru', 'en']. Defaults to 'en'.
        duration (int, optional): Length of intervals for extracting features. Defaults to 10.
        sr (int, optional): Sample rate. Defaults to 44100.
        device (torch.device | None, optional): Device type on local machine (GPU recommended).
            Defaults to None, in which case the CPU is used.
        output_dir (str | PathLike | None, optional): Path to the folder for saving results.
            Defaults to None, in which case 'temp/<video file name without extension>' is used.

    Returns:
        Tuple[str, str]: Paths to the emotion and congruence reports
        (returned by :meth:`get_congruence`).

    Raises:
        NotImplementedError: If 'lang' is not equal to 'en' or 'ru'.

    Example:
        >>> import torch
        >>> cong_detector = CongruenceDetector(
        ...     video_path="test_video.mp4",
        ...     features_path="temp/test_video/features.json",
        ...     face_image="temp/test_video/faces/0.jpg",
        ...     transcription_path="temp/test_video/transcription.json",
        ...     diarization_path="temp/test_video/diarization.json",
        ...     device=torch.device("cuda:0"),
        ... )
        >>> cong_detector.get_congruence()
        ("temp/test_video/emotions.json", "temp/test_video/congruence.json")
    """

    # Emotions compared across channels; each channel is expected to provide a
    # '<channel>_<emotion>' column for every entry.
    _EMOTIONS = ("neutral", "anger", "happiness")
    _CHANNELS = ("video", "audio", "text")

    def __init__(
        self,
        video_path: str | PathLike,
        features_path: str | PathLike,
        face_image: str | PathLike,
        transcription_path: str | PathLike,
        diarization_path: str | PathLike,
        lang: str = "en",
        duration: int = 10,
        sr: int = 44100,
        device: torch.device | None = None,
        output_dir: str | PathLike | None = None,
    ):
        if lang not in ["en", "ru"]:
            raise NotImplementedError("'lang' must be 'en' or 'ru'.")

        self.lang = lang
        self.video_path = video_path
        self.features_path = features_path
        self.face_image = face_image
        self.transcription_path = transcription_path
        self.duration = duration
        self.sr = sr
        self._device = torch.device("cpu") if device is None else device

        # The diarization stamps are loaded once and shared by the audio and
        # text analysis steps.
        with open(diarization_path, "r", encoding="utf-8") as file:
            self.stamps = json.load(file)

        if output_dir is not None:
            self.temp_path = output_dir
        else:
            basename = os.path.splitext(os.path.basename(video_path))[0]
            self.temp_path = os.path.join("temp", basename)
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(self.temp_path, exist_ok=True)

    @property
    def device(self) -> torch.device:
        """Check the device type.

        Returns:
            torch.device: Device type on local machine.
        """
        return self._device

    def _to_aligned_frame(self, records) -> pd.DataFrame:
        """Build a DataFrame with one row per analysis interval.

        Timestamps are floored to a multiple of ``self.duration`` so that rows
        coming from different channels can be matched on ``time_sec``. When
        several rows fall into the same interval only the first one is kept.

        Args:
            records: Data accepted by ``pandas.DataFrame`` that yields a
                'time_sec' column.

        Returns:
            pd.DataFrame: Rows sorted by 'time_sec' with a fresh integer index.
        """
        frame = pd.DataFrame(data=records)
        # Align to the configured interval length rather than a fixed 10 s grid,
        # otherwise shorter intervals would be merged and dropped as duplicates.
        frame["time_sec"] = frame["time_sec"] - frame["time_sec"] % self.duration
        return (
            frame.drop_duplicates(subset=["time_sec"])
            .sort_values(by="time_sec")
            .reset_index(drop=True)
        )

    def get_video_state(self):
        """Get emotions detected on the video channel.

        Returns:
            tuple: DataFrame with video emotions aligned on the time grid and
            the key returned by ``get_video_emotions``, which is later passed
            to the audio and text analysis to select the speaker.
        """
        video_data, key = get_video_emotions(
            video_path=self.video_path,
            features_path=self.features_path,
            face_image=self.face_image,
            device=self._device,
            duration=self.duration,
        )
        return self._to_aligned_frame(video_data), key

    def get_audio_state(self, key) -> pd.DataFrame:
        """Get emotions detected on the audio channel.

        Args:
            key: Speaker key obtained from :meth:`get_video_state`.

        Returns:
            pd.DataFrame: Audio emotions aligned on the time grid.
        """
        audio_model = AudioAnalysis(
            video_path=self.video_path,
            stamps=self.stamps,
            speaker=key,
            sr=self.sr,
            duration=self.duration,
            device=self._device,
        )
        return self._to_aligned_frame(audio_model.predict())

    def get_text_state(self, key) -> pd.DataFrame:
        """Get emotions detected on the text channel.

        Args:
            key: Speaker key obtained from :meth:`get_video_state`.

        Returns:
            pd.DataFrame: Text emotions aligned on the time grid.
        """
        text_data = get_text_emotions(
            words_path=self.transcription_path,
            stamps=self.stamps,
            key=key,
            device=self._device,
            duration=self.duration,
        )
        return self._to_aligned_frame(text_data)

    def get_congruence(self):
        """Compute the congruence of emotions and save the reports.

        Two JSON files are written to the output folder: 'emotions.json' with
        the per-channel emotions and 'congruence.json' with the congruence
        value of every interval present in all three channels.

        Returns:
            Tuple[str, str]: Paths to the emotion and congruence reports.
        """
        video_data, key = self.get_video_state()
        audio_data = self.get_audio_state(key=key)
        text_data = self.get_text_state(key=key)

        # Inner joins keep only the intervals available in every channel.
        cong_data = video_data
        for channel_data in (audio_data, text_data):
            cong_data = cong_data.join(
                channel_data.set_index("time_sec"), how="inner", on="time_sec"
            )
        cong_data = cong_data.sort_values(by="time_sec").reset_index(drop=True)

        # Calculate the sum of deviations between emotions and normalize the
        # value. A larger spread means the channels disagree more.
        total_std = sum(
            cong_data.loc[
                :, [f"{channel}_{emotion}" for channel in self._CHANNELS]
            ].std(axis=1)
            for emotion in self._EMOTIONS
        )
        cong_data["congruence"] = total_std / 1.5

        emotions_path = os.path.join(self.temp_path, "emotions.json")
        congruence_path = os.path.join(self.temp_path, "congruence.json")

        # Get and save data with all emotions.
        emotions_data = {
            "video": video_data.to_dict(orient="records"),
            "audio": audio_data.to_dict(orient="records"),
            "text": text_data.to_dict(orient="records"),
        }
        with open(emotions_path, "w", encoding="utf-8") as file:
            json.dump(emotions_data, file)

        # NOTE(review): 'video_path' is expected to be a column of the joined
        # data (presumably supplied by the video records) - confirm upstream.
        cong_data[["video_path", "time_sec", "congruence"]].to_json(
            congruence_path, orient="records"
        )

        return emotions_path, congruence_path