Skip to content

Commit fe3f487

Browse files
committed
Extract from notebook
Add comments
1 parent bfb61a1 commit fe3f487

5 files changed

Lines changed: 336 additions & 1 deletion

File tree

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
import pandas as pd
2+
import speech_recognition as sr
3+
from dotenv import load_dotenv
4+
from inaSpeechSegmenter import Segmenter
5+
6+
7+
class GenderAudioIdentifier:
    """Segment a media file by speaker gender and transcribe each segment.

    Instantiating the class runs the whole pipeline: gender segmentation of
    the media file, speech-to-text on every gendered segment of the audio
    file, and the computation of the speaking time per gender.

    Attributes:
        title: Name of the media file without directory and extension.
        media: Path of the media file handed to the segmenter.
        audio: Path of the audio file handed to the speech recognizer.
        language: Language code used by default for the transcription.
        gendered_audio_seg: DataFrame with columns ``gender``, ``start``
            and ``end`` (times in seconds), one row per gendered segment.
        dialogues: DataFrame with columns ``gender`` and ``transcription``.
        speaking_time: Dict mapping ``'male'`` / ``'female'`` to the total
            duration of the matching segments.
    """

    # Segment labels kept from the segmenter output; other labels are dropped.
    GENDER_LABELS = ('male', 'female')

    def __init__(self, path_to_file, path_to_audio, language='fr-FR'):
        """Run segmentation, transcription and speaking-time computation.

        Args:
            path_to_file (str): Path to the media file to segment.
            path_to_audio (str): Path to the audio file to transcribe.
            language (str): Language code passed to the speech recognizer
                when transcribing the segments. Defaults to ``'fr-FR'``,
                the value that was previously hard-coded.
        """
        self.title = self._title_from_path(path_to_file)
        self.media = path_to_file
        self.audio = path_to_audio
        self.language = language
        self.gendered_audio_seg = self.segment()  # DataFrame
        self.dialogues = self.run_speech_to_text()
        self.speaking_time = self.compute_speaking_time_allocation()

    def __str__(self):
        return f"Film : {self.title}"

    def __repr__(self):
        return self.title

    @staticmethod
    def _title_from_path(path):
        """Return the file name of ``path`` without directory and extension.

        Both ``/`` and ``\\`` are accepted as directory separators, and only
        the last extension is removed, so ``"dir/Mr. Nobody.mp4"`` gives
        ``"Mr. Nobody"``.
        """
        file_name = path.replace('\\', '/').rsplit('/', 1)[-1]
        stem = file_name.rpartition('.')[0]
        # No dot at all, or a leading-dot name: keep the file name untouched.
        return stem if stem else file_name

    def segment(self):
        """Segment the media file and keep only the gendered segments.

        Returns:
            pd.DataFrame: Columns ``gender``, ``start`` and ``end``, one row
            per segment labelled ``'male'`` or ``'female'``.
        """
        # Author's note: the higher the energy ratio, the more selective the
        # detection; vad_engine works better with 'sm' than with 'smn'.
        seg = Segmenter(vad_engine='sm', energy_ratio=0.05)
        segments = seg(self.media)
        rows = [item for item in segments if item[0] in self.GENDER_LABELS]
        return pd.DataFrame(rows, columns=['gender', 'start', 'end'])

    def search_gender_tag(self, time: float):
        """Return the gender speaking at ``time``.

        Segments are treated as half-open intervals ``[start, end)`` so that
        a time falling exactly on the boundary between two contiguous
        segments belongs to the segment that starts there.

        Args:
            time (float): Time in seconds.

        Returns:
            The gender label of the segment containing ``time``, or ``None``
            when there is no segment at that time (including when no
            segment exists at all).
        """
        seg = self.gendered_audio_seg
        if seg.empty:
            return None
        inside = (seg['start'] <= time) & (time < seg['end'])
        matches = seg.loc[inside, 'gender']
        if matches.empty:
            return None
        # Should segments ever overlap, the last one wins, as it did before.
        return matches.iloc[-1]

    def compute_speaking_time_allocation(self):
        """Sum the duration of the segments for each gender.

        Returns:
            dict: ``{'male': <float>, 'female': <float>}``, each value being
            the sum of ``end - start`` over the segments of that gender.
        """
        seg = self.gendered_audio_seg
        speaking_time = {label: 0.0 for label in self.GENDER_LABELS}
        if seg.empty:
            return speaking_time
        durations = seg['end'] - seg['start']
        per_gender = durations.groupby(seg['gender']).sum()
        for label in speaking_time:
            if label in per_gender.index:
                speaking_time[label] = float(per_gender[label])
        return speaking_time

    def decode_speech(self, start_time=None, end_time=None, language="en-US"):
        """Transcribe the audio file, or a slice of it, with Google's API.

        Args:
            start_time (float, optional): Start of the slice in seconds.
                ``None`` starts at the beginning of the file.
            end_time (float, optional): End of the slice in seconds.
                ``None`` reads up to the end of the file.
            language (str): Language code of the speech, e.g. ``"en-US"``
                or ``"fr-FR"``.

        Returns:
            The recognized text, or ``None`` when the speech could not be
            understood or the recognition service could not be reached.
        """
        recognizer = sr.Recognizer()

        # Either bound may be omitted independently of the other.
        duration = None
        if end_time is not None:
            duration = end_time - (start_time or 0)

        # AudioFile is the current name of the deprecated WavFile alias.
        with sr.AudioFile(self.audio) as source:
            audio_text = recognizer.record(source, duration=duration, offset=start_time)

        # Only the two documented recognition failures are treated as
        # "no transcription"; any other error is a bug and must surface.
        try:
            text = recognizer.recognize_google(audio_text, language=language)
        except sr.UnknownValueError:
            print('Sorry.. speech could not be understood for this segment.')
            return None
        except sr.RequestError as error:
            print('Sorry.. run again... ({})'.format(error))
            return None

        print('Converting audio transcripts into text ...')
        return text

    def run_speech_to_text(self, language=None):
        """Transcribe every gendered segment of the audio file.

        Args:
            language (str, optional): Language code of the speech. Defaults
                to the language given at construction time.

        Returns:
            pd.DataFrame: Columns ``gender`` and ``transcription``, aligned
            row by row with ``gendered_audio_seg``.
        """
        if language is None:
            language = self.language
        seg = self.gendered_audio_seg
        transcript = [
            self.decode_speech(start_time=start, end_time=end, language=language)
            for start, end in zip(seg['start'], seg['end'])
        ]
        # Reuse the segment index so the concatenation aligns row by row.
        transcription = pd.Series(transcript, index=seg.index, name="transcription", dtype=object)
        return pd.concat([seg['gender'], transcription], axis=1)

    def export_to_csv(self, file_path: str):
        """Write the segments and their transcription to a ``;``-separated CSV.

        Args:
            file_path (str): Destination of the CSV file.
        """
        result = pd.concat([self.gendered_audio_seg, self.dialogues['transcription']], axis=1)
        result.to_csv(path_or_buf=file_path, sep=";", header=True, index=False)
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
from transformers import pipeline
2+
3+
4+
class SpeechRecognition:
    """Thin wrapper around an automatic-speech-recognition pipeline."""

    def __init__(self, model_name="openai/whisper-small"):
        """Build the speech recognition pipeline.

        The pipeline processes audio in 30 second chunks with a 5 second
        stride on each side and returns timestamps with the text.

        Args:
            model_name (str): Name of the model to load.
                Defaults to "openai/whisper-small".
        """
        pipeline_options = {
            "task": "automatic-speech-recognition",
            "model": model_name,
            "chunk_length_s": 30,
            "stride_length_s": (5, 5),
            "return_timestamps": True,
        }
        self.pipe = pipeline(**pipeline_options)

    def transcribe(self, audio_path, language, task="transcribe"):
        """Run the pipeline on an audio file.

        The decoder prompt of the underlying model is overwritten with the
        prompt ids the tokenizer returns for ``language`` and ``task`` before
        the pipeline is called, so the setting persists on the model config.

        Args:
            audio_path (str): Path to the audio file.
            language (str): Target language, e.g. "fr".
            task (str): "transcribe" to keep the spoken language or
                "translate" to translate into another language.

        Returns:
            Dict: The pipeline output for ``audio_path`` (the transcribed
            text and its timestamped chunks).
        """
        asr = self.pipe
        prompt_ids = asr.tokenizer.get_decoder_prompt_ids(language=language, task=task)
        asr.model.config.forced_decoder_ids = prompt_ids
        return asr(audio_path)

bechdelai/audio/utils.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
import os
2+
3+
import moviepy.editor as mp
4+
from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip
5+
6+
7+
def cut_and_save(movie_path: str, start: float, end: float, target_name: str) -> None:
    """Extract the ``[start, end]`` portion of a video into a new file.

    Args:
        movie_path (str): Path of the video file to cut.
        start (float): Start of the extract, in seconds.
        end (float): End of the extract, in seconds.
        target_name (str): File name of the video file to create.

    Returns:
        Whatever ``ffmpeg_extract_subclip`` returns (annotated as None).
    """
    outcome = ffmpeg_extract_subclip(
        movie_path,
        start,
        end,
        targetname=target_name,
    )
    return outcome
20+
21+
22+
def import_as_clip(path_to_video: str) -> mp.VideoFileClip:
    """Open a video file as a moviepy clip.

    Args:
        path_to_video (str): Location of the video file.

    Returns:
        mp.VideoFileClip: The clip built from ``path_to_video``.
    """
    clip = mp.VideoFileClip(path_to_video)
    return clip
32+
33+
34+
def separate_voice_and_music(file: str, output_dir: str = '../../../', max_duration: float = 700.0) -> None:
    """Split an audio file into its individual parts using spleeter.

    Runs ``spleeter separate`` and writes each separated part to
    ``<output_dir>/<instrument>/<filename>.<codec>``.

    The command is run without a shell and with the path passed as a single
    argument, so paths containing spaces or shell metacharacters are safe.

    Args:
        file (str): Path of the audio file to separate.
        output_dir (str): Directory given to spleeter's ``-o`` option.
            Defaults to the previously hard-coded ``'../../../'``.
        max_duration (float): Value given to spleeter's ``-d`` option, in
            seconds. Defaults to 700.0; the original author noted that the
            separation does not work above 700 seconds.

    Raises:
        FileNotFoundError: If the ``spleeter`` executable cannot be found.
        subprocess.CalledProcessError: If spleeter exits with an error.
    """
    # Local import keeps this function self-contained within the module.
    import subprocess

    command = [
        'spleeter', 'separate',
        '-d', str(max_duration),
        '-o', output_dir,
        # Literal spleeter placeholders, not Python format fields.
        '-f', '{instrument}/{filename}.{codec}',
        file,
    ]
    subprocess.run(command, check=True)
38+
39+
40+
def extract_audio_from_movie(file: str, extension: str = '.wav') -> None:
    """Extract the audio from a movie and save it to a file.

    The audio is saved next to the movie, under the same name with the
    movie's extension replaced by ``extension``.

    Args:
        file (str): Path of the movie file to extract the audio from.
        extension (str): File extension of the audio file to save,
            including the leading dot.
    """
    # splitext only strips the final extension, so dots elsewhere in the path
    # ("./movie.mp4", "films.2023/movie.v2.mp4") no longer truncate the name.
    audio_path = os.path.splitext(file)[0] + extension
    clip = import_as_clip(file)
    try:
        clip.audio.write_audiofile(audio_path)
    finally:
        # Release the reader held by the clip even if the export fails.
        clip.close()
Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 1,
6+
"metadata": {},
7+
"outputs": [],
8+
"source": [
9+
"%load_ext autoreload\n",
10+
"%autoreload 2"
11+
]
12+
},
13+
{
14+
"cell_type": "code",
15+
"execution_count": null,
16+
"metadata": {},
17+
"outputs": [],
18+
"source": [
19+
"from bechdelai.data.youtube import download_youtube_video"
20+
]
21+
},
22+
{
23+
"cell_type": "code",
24+
"execution_count": null,
25+
"metadata": {},
26+
"outputs": [],
27+
"source": [
28+
"youtube_trailer_url = \"https://www.youtube.com/watch?v=EzWIsGqeoVQ\"\n",
29+
"output_filename = \"raid.mp4\"\n",
30+
"youtube_language = \"fr-FR\""
31+
]
32+
},
33+
{
34+
"cell_type": "code",
35+
"execution_count": 4,
36+
"metadata": {},
37+
"outputs": [
38+
{
39+
"name": "stdout",
40+
"output_type": "stream",
41+
"text": [
42+
"Task Completed!\n"
43+
]
44+
}
45+
],
46+
"source": [
47+
"download_youtube_video(youtube_trailer_url, output_filename, youtube_language)"
48+
]
49+
},
50+
{
51+
"cell_type": "code",
52+
"execution_count": 2,
53+
"metadata": {},
54+
"outputs": [
55+
{
56+
"name": "stderr",
57+
"output_type": "stream",
58+
"text": [
59+
"/home/thomas/miniconda3/envs/bechdelai/lib/python3.9/site-packages/transformers/generation/utils.py:1273: UserWarning: Neither `max_length` nor `max_new_tokens` has been set, `max_length` will default to 448 (`generation_config.max_length`). Controlling `max_length` via the config is deprecated and `max_length` will be removed from the config in v5 of Transformers -- we recommend using `max_new_tokens` to control the maximum length of the generation.\n",
60+
" warnings.warn(\n"
61+
]
62+
},
63+
{
64+
"data": {
65+
"text/plain": [
66+
"{'text': \" Leur Raid, l'élite de la police, des super-agences sur-entraînées. Leur devise, servir sans faillir. Suivant. Bonjour. Ah, c'est pour le secrétairelien, l'infirmiereau ou la cantine ? Je suis là pour le groupe d'intervention du Raid. C'est un danger pour le groupe, une gonzesse. Une femme froissale. Ça crie, ça chiale, ça se pète les ondes, ça se poince les cheveux dans le casque. Bon, on a une femme, c'est comme ça. Oh non ! Puis ça, tout le temps, envie de pisser. On n'a pas les combinaisons adaptées pour les pisseuses. Effectivement, vous n'êtes pas du tout macho. Je me suis trompé. Mais ? Vous êtes 16. Y a 4 lits par chambre. Faites-moi tout de suite 4 groupes de 4, s'il vous plaît. Alors ça, c'est un groupe de 16. I've been looking for her file. She's got very high quality. Do you know that? That's annoying. Hide it. And the fact that she wears the same name as the Minister of the Interior? But it has nothing to do with it. It's her daughter, but it has no relationship. I warn you, on the first occasion, I'll fire her. We have to go get her. No, it's good, look, she's still making bubbles. En casion, je la fiers. Il faut aller la chercher là. Non, c'est beau, regarde, elle fait encore des bulles. Pardon ! Elle a, elle est éliminée, elle a... Ah bah non. L'ennemi est neutralisé là. She's finished. Oh, no. The enemy is neutralized.\",\n",
67+
" 'chunks': [{'text': \" Leur Raid, l'élite de la police, des super-agences sur-entraînées.\",\n",
68+
" 'timestamp': (0.0, 5.0)},\n",
69+
" {'text': ' Leur devise, servir sans faillir.', 'timestamp': (5.0, 8.0)},\n",
70+
" {'text': ' Suivant.', 'timestamp': (8.0, 9.0)},\n",
71+
" {'text': ' Bonjour.', 'timestamp': (9.0, 10.0)},\n",
72+
" {'text': \" Ah, c'est pour le secrétairelien, l'infirmiereau ou la cantine ?\",\n",
73+
" 'timestamp': (10.0, 12.0)},\n",
74+
" {'text': \" Je suis là pour le groupe d'intervention du Raid.\",\n",
75+
" 'timestamp': (12.0, 14.0)},\n",
76+
" {'text': \" C'est un danger pour le groupe, une gonzesse.\",\n",
77+
" 'timestamp': (14.0, 16.0)},\n",
78+
" {'text': ' Une femme froissale.', 'timestamp': (16.0, 17.0)},\n",
79+
" {'text': ' Ça crie, ça chiale, ça se pète les ondes, ça se poince les cheveux dans le casque.',\n",
80+
" 'timestamp': (17.0, 22.0)},\n",
81+
" {'text': \" Bon, on a une femme, c'est comme ça.\", 'timestamp': (22.0, 24.0)},\n",
82+
" {'text': ' Oh non ! Puis ça, tout le temps, envie de pisser.',\n",
83+
" 'timestamp': (24.0, 26.0)},\n",
84+
" {'text': \" On n'a pas les combinaisons adaptées pour les pisseuses.\",\n",
85+
" 'timestamp': (26.0, 28.0)},\n",
86+
" {'text': \" Effectivement, vous n'êtes pas du tout macho.\",\n",
87+
" 'timestamp': (28.0, 30.0)},\n",
88+
" {'text': ' Je me suis trompé.', 'timestamp': (30.0, 31.0)},\n",
89+
" {'text': ' Mais ?', 'timestamp': (31.0, 32.0)},\n",
90+
" {'text': ' Vous êtes 16.', 'timestamp': (32.0, 33.0)},\n",
91+
" {'text': ' Y a 4 lits par chambre.', 'timestamp': (33.0, 35.0)},\n",
92+
" {'text': \" Faites-moi tout de suite 4 groupes de 4, s'il vous plaît.\",\n",
93+
" 'timestamp': (35.0, 37.0)},\n",
94+
" {'text': \" Alors ça, c'est un groupe de 16.\", 'timestamp': (37.0, 42.0)},\n",
95+
" {'text': \" I've been looking for her file. She's got very high quality.\",\n",
96+
" 'timestamp': (42.0, 45.0)},\n",
97+
" {'text': ' Do you know that?', 'timestamp': (45.0, 46.0)},\n",
98+
" {'text': \" That's annoying.\", 'timestamp': (46.0, 47.0)},\n",
99+
" {'text': ' Hide it.', 'timestamp': (47.0, 48.0)},\n",
100+
" {'text': ' And the fact that she wears the same name as the Minister of the Interior?',\n",
101+
" 'timestamp': (48.0, 50.0)},\n",
102+
" {'text': ' But it has nothing to do with it.', 'timestamp': (50.0, 51.0)},\n",
103+
" {'text': \" It's her daughter, but it has no relationship.\",\n",
104+
" 'timestamp': (51.0, 53.0)},\n",
105+
" {'text': \" I warn you, on the first occasion, I'll fire her.\",\n",
106+
" 'timestamp': (53.0, 55.0)},\n",
107+
" {'text': ' We have to go get her.', 'timestamp': (55.0, 56.0)},\n",
108+
" {'text': \" No, it's good, look, she's still making bubbles.\",\n",
109+
" 'timestamp': (56.0, 59.0)},\n",
110+
" {'text': ' En casion, je la fiers.', 'timestamp': (59.0, 60.0)},\n",
111+
" {'text': ' Il faut aller la chercher là.', 'timestamp': (60.0, 61.0)},\n",
112+
" {'text': \" Non, c'est beau, regarde, elle fait encore des bulles.\",\n",
113+
" 'timestamp': (61.0, 63.0)},\n",
114+
" {'text': ' Pardon !', 'timestamp': (63.0, 63.5)},\n",
115+
" {'text': ' Elle a, elle est éliminée, elle a...', 'timestamp': (63.5, 65.0)},\n",
116+
" {'text': ' Ah bah non.', 'timestamp': (65.0, 65.5)},\n",
117+
" {'text': \" L'ennemi est neutralisé là.\", 'timestamp': (65.5, 66.5)},\n",
118+
" {'text': \" She's finished.\", 'timestamp': (66.5, 68.3)},\n",
119+
" {'text': ' Oh, no. The enemy is neutralized.', 'timestamp': (68.3, 70.7)}]}"
120+
]
121+
},
122+
"execution_count": 2,
123+
"metadata": {},
124+
"output_type": "execute_result"
125+
}
126+
],
127+
"source": [
128+
"from bechdelai.audio.speech_recognition import SpeechRecognition\n",
129+
"\n",
130+
"sr = SpeechRecognition()\n",
131+
"sr.transcribe(output_filename, \"fr\")\n"
132+
]
133+
}
134+
],
135+
"metadata": {
136+
"kernelspec": {
137+
"display_name": "bechdelai",
138+
"language": "python",
139+
"name": "python3"
140+
},
141+
"language_info": {
142+
"codemirror_mode": {
143+
"name": "ipython",
144+
"version": 3
145+
},
146+
"file_extension": ".py",
147+
"mimetype": "text/x-python",
148+
"name": "python",
149+
"nbconvert_exporter": "python",
150+
"pygments_lexer": "ipython3",
151+
"version": "3.9.16"
152+
},
153+
"orig_nbformat": 4,
154+
"vscode": {
155+
"interpreter": {
156+
"hash": "31ffc711ab2ee07bd298f523dc1dd63ebc15cb1e136e0e7de381fff9c93dfdff"
157+
}
158+
}
159+
},
160+
"nbformat": 4,
161+
"nbformat_minor": 2
162+
}

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ authors = ["Théo Alves Da Costa <theo.alves.da.costa@gmail.com>"]
66
license = "MIT"
77

88
[tool.poetry.dependencies]
9-
python = ">=3.8,<3.10"
9+
python = ">=3.8,<3.11"
1010
jupyter = "^1.0.0"
1111
pandas = "^1.3.4"
1212
numpy = "^1.21.3"

0 commit comments

Comments
 (0)