Skip to content

Audio out of sync #4

@Emirborovac

Description

@Emirborovac

Hello, I cloned your repo and the idea sounds amazing, but I ran into quite a few issues. The main one is the audio/video sync problem, which I have attempted to fix.
Also, the new OpenAI SDK requires a different approach. Here is the updated version for you to review.

import sys
import numpy as np
from pytube import YouTube
import cv2
import subprocess
import openai
import json
from datetime import datetime
import os
from os import path
import shutil
import json
import argparse

from dotenv import load_dotenv
load_dotenv()
import openai
import os

# Take the API key from the OPENAI_API_KEY environment variable (which
# load_dotenv() above can populate from a .env file) rather than a hard-coded
# literal. Falls back to the previous empty string when the variable is unset.
client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY", ""))

# Download video

def download_video(url, filename):
    """Download a YouTube video as MP4 into the ``tmp/`` directory.

    Args:
        url: Full YouTube watch URL.
        filename: File name (without directory) to store the download under.

    Raises:
        RuntimeError: If no MP4 stream is available for the video.
    """
    yt = YouTube(url)
    video = yt.streams.filter(file_extension='mp4').get_highest_resolution()
    if video is None:
        # An empty stream query yields None; report that clearly instead of
        # failing with an AttributeError on the .download() call below.
        raise RuntimeError(f"No downloadable mp4 stream found for {url}")
    video.download(filename=filename, output_path='tmp/')

def _timestamp_to_seconds(timestamp):
    """Convert ``HH:MM:SS`` (optionally followed by ``.mmm`` or ``,mmm``) to whole seconds."""
    # SRT timestamps put a comma before the milliseconds, so cut the fraction
    # off at either separator before parsing.
    whole = timestamp.replace(',', '.').split('.')[0].strip()
    pt = datetime.strptime(whole, '%H:%M:%S')
    return pt.second + pt.minute * 60 + pt.hour * 3600


def generate_segments(response, min_duration=50):
    """Cut one clip per segment out of ``tmp/input_video.mp4`` with ffmpeg.

    Each clip is written to ``tmp/outputNNN.mp4`` (NNN = zero-padded index).
    A failing ffmpeg run is reported and the remaining segments are still
    processed.

    Args:
        response: Iterable of dicts with ``start_time`` / ``end_time`` strings
            in ``HH:MM:SS`` form; a fractional part after ``.`` or ``,`` is
            ignored. Missing keys default to ``00:00:00``.
        min_duration: Minimum clip length in seconds. Shorter (or inverted)
            ranges are extended to ``start_time + min_duration``.

    Raises:
        ValueError: If a timestamp is not in ``HH:MM:SS`` form.
    """
    for i, segment in enumerate(response):
        print(i, segment)

        start_time = _timestamp_to_seconds(segment.get("start_time", "00:00:00"))
        end_time = _timestamp_to_seconds(segment.get("end_time", "00:00:00"))

        if end_time - start_time < min_duration:
            end_time = start_time + min_duration

        output_file = f"output{str(i).zfill(3)}.mp4"

        # -ss/-to are placed before -i so they apply to the input; the original
        # author notes this ordering is what keeps audio and video in sync.
        command = [
            "ffmpeg", "-y", "-hwaccel", "cuda",
            "-ss", str(start_time), "-to", str(end_time),
            "-i", "tmp/input_video.mp4",
            "-vf", "scale=1920:1080", "-qscale:v", "3", "-b:v", "6000k",
            "-c:v", "libx264", "-c:a", "aac", "-strict", "experimental",
            "-map", "0:v", "-map", "0:a",
            f"tmp/{output_file}",
        ]

        # An argument list avoids shell quoting entirely. A missing ffmpeg
        # binary surfaces as OSError, which is reported like any other failure.
        try:
            return_code = subprocess.call(command)
        except OSError as error:
            print(f"Error: could not run ffmpeg for {output_file}: {error}")
            continue
        if return_code != 0:
            print(f"Error: ffmpeg exited with code {return_code} for {output_file}")

def _crop_to_vertical(input_path, output_path):
    """Write a face-centred 1080x1920 (video-only) copy of *input_path* to *output_path*.

    Returns the frame rate used for the output, or None when the input could
    not be opened.
    """
    target_width, target_height = 1080, 1920
    switch_interval = 150  # re-run face detection every this many frames

    face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')

    cap = cv2.VideoCapture(input_path)
    out = None
    try:
        if not cap.isOpened():
            return None

        frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        original_fps = cap.get(cv2.CAP_PROP_FPS)
        if not original_fps or original_fps <= 0:
            # The capture reports 0 for properties it cannot provide; use a
            # sane rate so the writer and the later ffmpeg -r stay valid.
            original_fps = 30.0

        # Largest 9:16 window that fits inside the frame. Cropping with the
        # target aspect ratio means the resize below never stretches the image.
        crop_height = frame_height
        crop_width = round(crop_height * target_width / target_height)
        if crop_width > frame_width:
            crop_width = frame_width
            crop_height = min(frame_height, round(crop_width * target_height / target_width))

        # INTER_AREA suits shrinking; use INTER_LINEAR when the crop is enlarged.
        if crop_width >= target_width:
            interpolation = cv2.INTER_AREA
        else:
            interpolation = cv2.INTER_LINEAR

        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_path, fourcc, original_fps, (target_width, target_height))

        face_center = (frame_width // 2, frame_height // 2)
        current_face_index = 0
        frame_count = 0

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            if frame_count % switch_interval == 0:
                gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
                faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=7, minSize=(100, 100))

                # len() rather than truthiness: the result may be an array.
                if len(faces) > 0:
                    # Rotate through the detected faces on each detection pass.
                    current_face_index = (current_face_index + 1) % len(faces)
                    x, y, w, h = faces[current_face_index]
                    face_center = (int(x + w // 2), int(y + h // 2))

            # Centre the window on the face, then shift (never shrink) it so it
            # stays fully inside the frame.
            crop_x = min(max(0, face_center[0] - crop_width // 2), frame_width - crop_width)
            crop_y = min(max(0, face_center[1] - crop_height // 2), frame_height - crop_height)

            crop_img = frame[crop_y:crop_y + crop_height, crop_x:crop_x + crop_width]
            resized = cv2.resize(crop_img, (target_width, target_height), interpolation=interpolation)
            out.write(resized)
            frame_count += 1

        return original_fps
    finally:
        # Always release the handles so the output file is flushed and closed
        # before ffmpeg reads it, even when a frame fails to process.
        cap.release()
        if out is not None:
            out.release()


def generate_short(input_file, output_file):
    """Turn ``tmp/<input_file>`` into a vertical 1080x1920 short with audio.

    Steps: crop every frame around a detected face and write the silent result
    to ``tmp/<output_file>``; extract the source audio; mux both into
    ``tmp/final-<output_file>``; if ``tmp/<output stem>.srt`` exists, burn the
    subtitles into ``tmp/final-<output stem>_subtitled.mp4``.

    Any exception is caught and printed, so the function never raises.

    Args:
        input_file: Name of the source clip inside ``tmp/``.
        output_file: Name for the cropped clip inside ``tmp/``.
    """
    try:
        original_fps = _crop_to_vertical(f"tmp/{input_file}", f"tmp/{output_file}")
        if original_fps is None:
            print(f"🚨 Error: Could not open tmp/{input_file} for cropping.")
            return

        # Extract audio
        audio_file = f"tmp/audio_{os.path.splitext(input_file)[0]}.aac"
        subprocess.call([
            "ffmpeg", "-y", "-hwaccel", "cuda",
            "-i", f"tmp/{input_file}", "-vn", "-acodec", "aac", audio_file,
        ])

        # Merge video and audio; -r before -i declares the input frame rate as
        # the rate the cropped video was written with.
        merged_output = f"tmp/final-{output_file}"
        merge_command = [
            "ffmpeg", "-y", "-hwaccel", "cuda",
            "-r", str(original_fps), "-i", f"tmp/{output_file}",
            "-i", audio_file,
            "-filter_complex", "[1:a]aresample=async=1,atempo=1[a]",
            "-map", "0:v", "-map", "[a]",
            "-c:v", "libx264", "-c:a", "aac", "-strict", "experimental",
            "-shortest", merged_output,
        ]
        # ffmpeg's diagnostics go to a log file the error message points at.
        with open("tmp/ffmpeg_merge_error.log", "w") as merge_log:
            subprocess.call(merge_command, stderr=merge_log)

        if not os.path.exists(merged_output):
            print(f"🚨 Error: Final output file {merged_output} was not created. Check tmp/ffmpeg_merge_error.log for details.")
            return

        # Subtitle embedding
        output_stem = os.path.splitext(output_file)[0]
        subtitle_file = f"tmp/{output_stem}.srt"
        if os.path.exists(subtitle_file):
            final_output_with_subs = f"tmp/final-{output_stem}_subtitled.mp4"
            subprocess.call([
                "ffmpeg", "-y", "-hwaccel", "cuda",
                "-i", merged_output, "-vf", f"subtitles={subtitle_file}",
                "-c:a", "copy", final_output_with_subs,
            ])
            print(f"✅ Subtitled video saved as {final_output_with_subs}")
        else:
            print(f"🚨 Warning: No subtitle file found for {output_file}. Subtitles will be missing.")

    except Exception as e:
        # Best-effort boundary: one failing clip must not abort the whole run.
        print(f"Error during video cropping: {str(e)}")

def _strip_code_fence(text):
    """Remove a surrounding Markdown code fence (``` or ```json) from *text*."""
    text = text.strip()
    if text.startswith("```"):
        text = text[3:]
        if text[:4].lower() == "json":
            text = text[4:]
        text = text.rstrip()
        if text.endswith("```"):
            text = text[:-3]
    return text.strip()


def generate_viral(transcript):
    """Ask the chat model for the three most viral segments of a transcript.

    Args:
        transcript: Transcript text (the SRT content) to analyse.

    Returns:
        A dict with a ``"segments"`` key on success. A bare JSON list is
        wrapped as ``{"segments": [...]}``. Returns None when the reply is
        empty, is not valid JSON, or lacks the ``"segments"`` key.
    """
    messages = [
        {"role": "system", "content": "You analyze video transcripts and identify viral segments."},
        {"role": "user", "content": f"Find the 3 most viral segments in this transcript and return a JSON response. The JSON should match this format: {json.dumps({'segments': [{'start_time': '00:00:00', 'end_time': '00:00:50', 'description': 'Engaging content', 'duration': 50}]}, indent=4)}. Transcript: {transcript}"}
    ]

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=messages,
        max_tokens=512
    )

    raw_response = response.choices[0].message.content
    print("DEBUG - GPT Raw Response:", raw_response)

    if not raw_response:
        # The content can be absent; avoid calling string methods on None.
        print("Error: OpenAI response is empty.")
        return None

    # Models often wrap JSON in a Markdown fence; strip it regardless of
    # language tag or trailing whitespace before parsing.
    cleaned = _strip_code_fence(raw_response)

    try:
        viral_segments = json.loads(cleaned)
    except json.JSONDecodeError:
        print("Error: OpenAI response is not valid JSON after cleanup.")
        return None

    if isinstance(viral_segments, list):
        viral_segments = {"segments": viral_segments}

    if not isinstance(viral_segments, dict) or "segments" not in viral_segments:
        print("Error: OpenAI response is missing 'segments' key.")
        return None

    return viral_segments

def generate_subtitle(input_file, output_folder, results_folder):
    """Run ``auto_subtitle`` on ``tmp/<input_file>`` to produce an SRT file.

    The SRT is expected at ``<results_folder>/<output_folder>/<input stem>.srt``.
    Progress and failures are printed; nothing is returned or raised.

    Args:
        input_file: Name of the video inside ``tmp/``.
        output_folder: Sub-directory of *results_folder* for the SRT.
        results_folder: Root results directory.
    """
    # Imported locally because `time` is not among the file's top-level
    # imports; the original call to time.sleep() raised NameError.
    import time

    output_dir = f"{results_folder}/{output_folder}"
    subtitle_file = f"{output_dir}/{os.path.splitext(input_file)[0]}.srt"

    # ✅ Check if input video exists before running auto_subtitle
    if not os.path.exists(f"tmp/{input_file}"):
        print(f"🚨 Error: Video file {input_file} does not exist. Skipping subtitle generation.")
        return

    # Prefer the Windows virtualenv executable when present; otherwise use the
    # `auto_subtitle` command from PATH, as generate_transcript does.
    auto_subtitle_path = "./venv/Scripts/auto_subtitle.exe"
    if not os.path.exists(auto_subtitle_path):
        auto_subtitle_path = "auto_subtitle"

    command = [
        auto_subtitle_path, f"tmp/{input_file}",
        "-o", output_dir,
        "--output_srt", "True", "--srt_only", "True", "--model", "medium",
    ]

    print(f"📢 Running command: {' '.join(command)}")
    try:
        subprocess.call(command)
    except OSError as error:
        print(f"🚨 Error: Could not run auto_subtitle: {error}")
        return

    # ✅ Debug: Print all files in the output directory
    if os.path.exists(output_dir):
        print("📂 DEBUG - Checking subtitle files in directory:", os.listdir(output_dir))

    # ✅ Wait for the subtitle file to be created (up to ~15 seconds)
    for _ in range(15):
        if os.path.exists(subtitle_file):
            print("✅ Subtitle file generated successfully:", subtitle_file)
            return
        time.sleep(1)

    print("🚨 Error: Subtitle file was not created. Skipping subtitle embedding step.")

def generate_transcript(input_file):
    """Transcribe ``tmp/<input_file>`` with ``auto_subtitle`` and return the SRT text.

    Args:
        input_file: Name of the video inside ``tmp/``.

    Returns:
        The content of the generated ``tmp/<input stem>.srt`` file.

    Exits the process with status 1 when ``auto_subtitle`` cannot be started
    or the SRT file cannot be read.
    """
    command = [
        "auto_subtitle", f"tmp/{input_file}",
        "--srt_only", "True", "--output_srt", "True",
        "-o", "tmp/", "--model", "medium",
    ]
    try:
        subprocess.call(command)
    except OSError as error:
        # Report a missing/unlaunchable tool directly rather than as a
        # confusing "failed to read" error further down.
        print(f"Error: Failed to run auto_subtitle: {error}")
        sys.exit(1)

    # Only the final extension is removed, so names containing extra dots keep
    # their full stem (presumably matching how auto_subtitle names its output
    # — confirm against the installed version).
    srt_path = f"tmp/{os.path.splitext(os.path.basename(input_file))[0]}.srt"
    try:
        with open(srt_path, 'r', encoding='utf-8') as file:
            transcript = file.read()
    except IOError:
        print("Error: Failed to read the input file.")
        sys.exit(1)
    return transcript

def main():
    """Command-line entry point: build short vertical clips from one video.

    Accepts exactly one of ``--video_id`` (downloaded from YouTube) or
    ``--file`` (copied locally). The video lands in ``tmp/input_video.mp4``;
    the selected segments are stored in ``results/<video_id>/content.txt`` and
    reused on later runs. Each segment is then cut, cropped and subtitled.

    Exits with status 1 on invalid arguments, unreadable or unwritable files,
    or when no valid segment list can be obtained.
    """
    parser = argparse.ArgumentParser(description='Create 3 reels or tiktoks from Youtube video')
    parser.add_argument('-v', '--video_id', required=False, help='Youtube video id. Ex: Cuptv7-A4p0 in https://www.youtube.com/watch?v=Cuptv7-A4p0')
    parser.add_argument('-f', '--file', required=False, help='Video file to be used')
    args = parser.parse_args()

    if not args.video_id and not args.file:
        print('Needed at least one argument. <command> --help for help')
        sys.exit(1)

    if args.video_id and args.file:
        print('use --video_id or --file')
        sys.exit(1)

    # Start every run from an empty scratch directory.
    try:
        if os.path.exists("tmp"):
            shutil.rmtree("tmp")
        os.mkdir('tmp')
    except OSError as error:
        print(error)

    filename = 'input_video.mp4'
    if args.video_id:
        video_id = args.video_id
        url = 'https://www.youtube.com/watch?v=' + video_id
        download_video(url, filename)
    else:
        video_id = os.path.basename(args.file).split('.')[0]
        print(video_id)
        if not os.path.exists(args.file):
            print(f"File {args.file} does not exist")
            sys.exit(1)
        shutil.copy(args.file, f"tmp/{filename}")

    output_folder = 'results'

    # exist_ok avoids printing a spurious "File exists" error on every re-run.
    try:
        os.makedirs(f"{output_folder}/{video_id}", exist_ok=True)
    except OSError as error:
        print(error)

    content_path = f"{output_folder}/{video_id}/content.txt"
    if not os.path.exists(content_path):
        transcript = generate_transcript(filename)
        print(transcript)

        parsed_content = generate_viral(transcript)
        print("DEBUG - viral_segments:", parsed_content)

        if parsed_content is None:
            # Do not cache a failed response: writing "null" to content.txt
            # would make every later run for this video fail as well.
            print("Error: Could not obtain viral segments from OpenAI.")
            sys.exit(1)

        try:
            with open(content_path, 'w', encoding='utf-8') as file:
                file.write(json.dumps(parsed_content, indent=4))
        except IOError:
            print("Error: Failed to write the output file.")
            sys.exit(1)
        print("Full transcription written to ", content_path)
    else:
        try:
            with open(content_path, 'r', encoding='utf-8') as file:
                parsed_content = json.load(file)
        except IOError:
            print("Error: Failed to read the input file.")
            sys.exit(1)
        except json.JSONDecodeError:
            print(f"Error: Cached content in {content_path} is not valid JSON.")
            sys.exit(1)

    if not isinstance(parsed_content, dict) or "segments" not in parsed_content:
        print("Error: Parsed content is not a valid JSON dictionary with 'segments'.")
        print("DEBUG - Parsed Content:", parsed_content)
        sys.exit(1)

    segments = parsed_content["segments"]
    generate_segments(segments)

    # NOTE(review): generate_short looks for tmp/<cropped stem>.srt to burn in
    # subtitles, but the SRT is only generated afterwards and into
    # results/<video_id>/ — so the burn-in step presumably never runs. Confirm
    # the intended order and location before changing either function.
    for i, _ in enumerate(segments):
        segment_file = f'output{str(i).zfill(3)}.mp4'
        cropped_file = f'output_cropped{str(i).zfill(3)}.mp4'
        generate_short(segment_file, cropped_file)
        generate_subtitle(f"final-{cropped_file}", video_id, output_folder)

# Run the CLI only when the file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions