Have you ever recorded a video and wished you could easily remove unwanted filler words like “umm,” “uhh,” or “aaa” — without manually scrubbing through an editor? It can be frustrating and time-consuming. That's why I built a solution using WhisperModel from faster-whisper, a fast implementation of OpenAI's Whisper.

What Do We Need to Get Started?
For this demo, we’ll use Gradio to build the interface and Whisper to transcribe the video. We'll also use moviepy for cutting and editing the video.
What Is Gradio?
Gradio is a Python library that allows data scientists and ML engineers to create web UIs for their models with minimal code. It supports a wide range of ML frameworks like PyTorch and TensorFlow.
You can create something like a text classifier in just a few lines:
import gradio as gr

# Load your model (placeholder for any model with a .predict method)
model = load_model()

# Define the prediction function
def predict_text(input_text):
    prediction = model.predict(input_text)
    return prediction

# Create and launch the Gradio interface
iface = gr.Interface(fn=predict_text, inputs="text", outputs="text")
iface.launch()
What Is WhisperModel?
Whisper is an automatic speech recognition (ASR) system by OpenAI. We'll use faster-whisper, a reimplementation of Whisper built on CTranslate2, to transcribe the video's audio into text. It exposes the model through a WhisperModel class.
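As a quick taste, here's a minimal sketch of transcribing an audio file with word-level timestamps (assuming an audio.wav file already exists on disk):

from faster_whisper import WhisperModel

model = WhisperModel("medium")
segments, info = model.transcribe("audio.wav", word_timestamps=True)
for segment in segments:
    for word in segment.words:
        print(f"{word.start:.2f}-{word.end:.2f}: {word.word}")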
📦 Install Requirements
pip install moviepy==1.0.3 faster-whisper==0.7.0 gradio
Note: moviepy is pinned to 1.x because the code below uses the moviepy.editor module and subclip, both of which changed in MoviePy 2.0.
🔧 Create Helper Functions
We’ll isolate the core logic in a file called helpers.py.
from faster_whisper import WhisperModel
import moviepy.editor as mp
import re
from moviepy.video.io.VideoFileClip import VideoFileClip
from moviepy.video.compositing.concatenate import concatenate_videoclips
Load the Model
def load_model(model_size="medium"):
    return WhisperModel(model_size)
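The defaults above run on CPU. If you have a GPU, faster-whisper also accepts device and compute_type arguments; a hedged variant:

# Assumes a CUDA-capable GPU; use device="cpu", compute_type="int8" otherwise
model = WhisperModel("medium", device="cuda", compute_type="float16")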
Transcribe the Video
def transcribe_video(video_path, model, audio_path='audio.wav'):
    # Extract the audio track and save it as a standalone WAV file
    video = mp.VideoFileClip(video_path)
    audio_file = video.audio
    audio_file.write_audiofile(audio_path)
    # Transcribe with word-level timestamps; segments is a generator, so materialize it
    segments, info = model.transcribe(audio_path, word_timestamps=True)
    return list(segments)
Map Words with Timestamps
def mapping_segments(segments):
    # Build a dict mapping each "start-end" time range to its cleaned word,
    # plus a flat list of words forming the transcript
    subtitles_word = {}
    transcript = []
    for segment in segments:
        for word in segment.words:
            clean_word = re.sub(r'[^\w\s]', '', word.word.strip())
            subtitles_word[f"{word.start}-{word.end}"] = clean_word
            transcript.append(clean_word)
    return subtitles_word, transcript
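To make the data shape concrete, here's roughly what the two return values look like for a short clip (timestamps invented for illustration):

subtitles_word, transcript = mapping_segments(segments)
# subtitles_word -> {"0.0-0.4": "Hello", "0.4-0.7": "umm", "0.7-1.2": "everyone"}
# transcript     -> ["Hello", "umm", "everyone"]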
Find Time Ranges of Removed Words
def find_time_ranges_to_cut(subtitles_word, edited_script_list_word):
    # Walk the original transcript and the edited script in parallel;
    # any original word that no longer matches the edited script gets its range cut
    tracked_index = 0
    time_range_to_cut = []
    for range_, sub in subtitles_word.items():
        if tracked_index < len(edited_script_list_word) and sub == edited_script_list_word[tracked_index]:
            tracked_index += 1
        else:
            time_range_to_cut.append(range_)
    return time_range_to_cut
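A quick sanity check with the toy data from above: if the user deletes "umm" from the script, only its time range is flagged:

subs = {"0.0-0.4": "Hello", "0.4-0.7": "umm", "0.7-1.2": "everyone"}
print(find_time_ranges_to_cut(subs, ["Hello", "everyone"]))
# -> ['0.4-0.7']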
Cut the Video Based on Time Ranges
def cut_video(input_video, output_video, keep_ranges):
    # Extract the ranges to keep and stitch them back together,
    # which effectively cuts out everything in between
    video_clip = VideoFileClip(input_video)
    kept_clips = [video_clip.subclip(float(start), float(end)) for start, end in keep_ranges]
    final_clip = concatenate_videoclips(kept_clips)
    final_clip.write_videofile(output_video, codec="libx264", audio_codec="aac")
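For example, to keep everything except seconds 3.0 to 4.5 of a 10-second clip (file names here are placeholders):

cut_video("input.mp4", "trimmed.mp4", [(0, 3.0), (4.5, 10.0)])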
🧠 Gradio App Logic
These two functions also live in helpers.py. They share a module-level model, so load it once before defining them:
model = load_model()
Process Video to Text
def process_video(video_file):
    # Transcribe the uploaded video and return the transcript as one editable string
    segments = transcribe_video(video_file, model)
    subtitles_word, list_words = mapping_segments(segments)
    return ' '.join(list_words)
Edit the Video After Text is Modified
def edit_video(script, video_file):
    # Re-transcribe the video to get fresh word-level timestamps
    segments = transcribe_video(video_file, model)
    subtitles_word_text, list_words = mapping_segments(segments)
    # Clean the edited script the same way the transcript was cleaned
    cleaned_script = re.sub(r'[^\w\s]', '', script)
    edited_words = [word for word in cleaned_script.split(' ') if word]
    # Time ranges of the words the user deleted
    time_range_to_cut = find_time_ranges_to_cut(subtitles_word_text, edited_words)
    time_range_to_cut_cleaned = [(r.split('-')[0], r.split('-')[1]) for r in time_range_to_cut]
    # Flatten the cut ranges into a chronological list of boundary timestamps:
    # [cut1_start, cut1_end, cut2_start, cut2_end, ...]
    boundaries = []
    for range_time in time_range_to_cut_cleaned:
        for r in range_time:
            boundaries.append(r)
    if boundaries:
        video_clip = VideoFileClip(video_file)
        # Keep everything before the first cut and after the last cut...
        start_range = (0, float(boundaries[0]))
        end_range = (float(boundaries[-1]), video_clip.duration)
        complete_range = [start_range]
        # ...and pair the inner boundaries to keep the gaps between cuts
        inner = boundaries[1:-1]
        for i in range(0, len(inner) - 1, 2):
            complete_range.append((float(inner[i]), float(inner[i + 1])))
        complete_range.append(end_range)
        output_video_path = "output.mp4"
        cut_video(video_file, output_video_path, complete_range)
        return output_video_path
    # Nothing was removed; return the original video untouched
    return video_file
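To see the pairing logic with concrete numbers: suppose two filler words were flagged at 2.0-2.5s and 6.0-6.4s in a 10-second video (invented values). The flattened boundary list is [2.0, 2.5, 6.0, 6.4], and the kept ranges become:

# complete_range -> [(0, 2.0), (2.5, 6.0), (6.4, 10.0)]
# i.e. before the first cut, the gap between the cuts, and after the last cut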
🎛️ Build the Gradio Interface (app.py)
import gradio as gr
from helpers import *  # helpers.py loads the Whisper model at import time

with gr.Blocks() as demo:
    gr.HTML("<h2>Edit Your Videos Like a Spreadsheet Using <a href='https://huggingface.co/docs/transformers/model_doc/whisper'>WhisperModel</a></h2>")
    gr.Markdown("Upload a video and click **Transcribe** to extract the full script. Edit the script to remove unwanted words, then click **Cut** to generate a new video.")
    with gr.Row():
        video_file = gr.Video(label="Upload Video")
        script = gr.Textbox(label='Script')
        results = gr.Video(label='Result')
    with gr.Row():
        transcribe = gr.Button('Transcribe')
        cut_button = gr.Button('Cut')
    transcribe.click(fn=process_video, inputs=video_file, outputs=script)
    cut_button.click(fn=edit_video, inputs=[script, video_file], outputs=results)
    gr.Markdown('Made by **Otman Heddouch**')

demo.launch(share=True, debug=True)
🎉 Final Notes
You can try the working demo on Hugging Face or explore the source code on GitHub.
If you have any questions or suggestions, feel free to reach out!
