Audio out of sync

Hello, I cloned your repo and the idea sounds amazing, but I ran into quite a few issues. The main one is that the audio and video are out of sync, which I have attempted to fix.
Also, the new OpenAI SDK requires a different approach. Here is the updated version for you to review:

import sys
import numpy as np
from pytube import YouTube
import cv2
import subprocess
import openai
import json
from datetime import datetime
import os
from os import path
import shutil
import json
import argparse

from dotenv import load_dotenv
load_dotenv()
import openai
import os

# Take the API key from the environment (load_dotenv() above has already merged
# any .env file into it) instead of hard-coding an empty string. Falls back to
# the previous empty value when OPENAI_API_KEY is not set.
client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY", ""))

# Download video
def download_video(url, filename):
    """Download the highest-resolution MP4 stream of *url* into ``tmp/``.

    Args:
        url: Full YouTube watch URL.
        filename: File name to store the download under inside ``tmp/``.
    """
    mp4_streams = YouTube(url).streams.filter(file_extension='mp4')
    best_stream = mp4_streams.get_highest_resolution()
    best_stream.download(output_path='tmp/', filename=filename)

def _timestamp_to_seconds(value):
    """Convert a timestamp to whole seconds.

    Accepts ``HH:MM:SS``, ``MM:SS`` or ``SS`` strings, optionally followed by a
    fractional part introduced by ``.`` or ``,`` (the fraction is dropped), as
    well as plain numbers. Raises ``ValueError`` for anything else.
    """
    if isinstance(value, (int, float)):
        seconds = int(value)
    else:
        # Normalise the SRT-style decimal comma, then drop the fraction.
        whole = str(value).strip().replace(',', '.').split('.')[0]
        parts = whole.split(':')
        if not 1 <= len(parts) <= 3:
            raise ValueError(f"unsupported timestamp {value!r}")
        seconds = 0
        for part in parts:
            if not part.isdigit():
                raise ValueError(f"unsupported timestamp {value!r}")
            seconds = seconds * 60 + int(part)
    if seconds < 0:
        raise ValueError(f"negative timestamp {value!r}")
    return seconds


def generate_segments(response):
    """Cut one clip per segment out of ``tmp/input_video.mp4`` with ffmpeg.

    Each clip is written to ``tmp/outputNNN.mp4`` (NNN is the zero-padded
    segment index). Clips shorter than 50 seconds are extended so that they
    end 50 seconds after their start.

    Args:
        response: Iterable of dicts with ``start_time`` / ``end_time`` entries.
            A missing entry defaults to ``"00:00:00"``.

    Segments whose timestamps cannot be parsed are reported and skipped
    instead of aborting the whole run; their index is still consumed, so the
    numbering of the remaining output files is unchanged.
    """
    for i, segment in enumerate(response):
        print(i, segment)

        try:
            start_time = _timestamp_to_seconds(segment.get("start_time", "00:00:00"))
            end_time = _timestamp_to_seconds(segment.get("end_time", "00:00:00"))
        except (AttributeError, TypeError, ValueError, OverflowError) as error:
            print(f"Skipping segment {i}: invalid timestamp ({error})")
            continue

        # Enforce a minimum clip length of 50 seconds.
        if end_time - start_time < 50:
            end_time = start_time + 50

        output_file = f"output{str(i).zfill(3)}.mp4"

        # -ss/-to are placed before -i so they act as input options.
        # Only integers and fixed paths are interpolated into the shell string.
        command = (
            f"ffmpeg -y -hwaccel cuda -ss {start_time} -to {end_time} -i tmp/input_video.mp4 "
            f"-vf scale=1920:1080 -qscale:v 3 -b:v 6000k -c:v libx264 -c:a aac -strict experimental "
            f"-map 0:v -map 0:a tmp/{output_file}"
        )

        subprocess.call(command, shell=True)

def _crop_to_vertical(input_path, output_path, target_size=(1080, 1920), switch_interval=150):
    """Write a face-centred vertical crop of *input_path* to *output_path*.

    Every *switch_interval* frames a Haar-cascade face detection is run; when
    faces are found the crop centre moves to the next detected face. The crop
    window has the same aspect ratio as *target_size* and is shifted (not
    truncated) to stay inside the frame, so the resize does not distort the
    picture.

    Returns the source FPS as reported by OpenCV, or ``None`` when the input
    could not be opened. The capture and writer are always released.
    """
    target_width, target_height = target_size
    face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')

    cap = cv2.VideoCapture(input_path)
    out = None
    try:
        if not cap.isOpened():
            return None

        frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        original_fps = cap.get(cv2.CAP_PROP_FPS)
        if frame_width <= 0 or frame_height <= 0:
            return None

        # Largest window with the target aspect ratio that fits in the frame
        # (never larger than the target itself).
        crop_h = min(frame_height, target_height)
        crop_w = int(round(crop_h * target_width / target_height))
        if crop_w > frame_width:
            crop_w = frame_width
            crop_h = min(frame_height, int(round(crop_w * target_height / target_width)))
        crop_w = max(crop_w, 1)
        crop_h = max(crop_h, 1)

        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        # Write with the source FPS so the frames line up with the audio later.
        out = cv2.VideoWriter(output_path, fourcc, original_fps, (target_width, target_height))

        face_center = (frame_width // 2, frame_height // 2)
        current_face_index = 0
        frame_count = 0

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            if frame_count % switch_interval == 0:
                gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
                faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=7, minSize=(100, 100))

                if len(faces) > 0:
                    # Rotate through the detected faces on each detection pass.
                    current_face_index = (current_face_index + 1) % len(faces)
                    x, y, w, h = faces[current_face_index]
                    face_center = (int(x + w // 2), int(y + h // 2))

            # Centre the window on the face, then clamp it inside the frame.
            crop_x = min(max(0, face_center[0] - crop_w // 2), frame_width - crop_w)
            crop_y = min(max(0, face_center[1] - crop_h // 2), frame_height - crop_h)

            crop_img = frame[crop_y:crop_y + crop_h, crop_x:crop_x + crop_w]
            resized = cv2.resize(crop_img, (target_width, target_height), interpolation=cv2.INTER_AREA)
            out.write(resized)
            frame_count += 1

        return original_fps
    finally:
        cap.release()
        if out is not None:
            out.release()


def generate_short(input_file, output_file):
    """Turn ``tmp/<input_file>`` into a 1080x1920 short with its original audio.

    Steps:
      1. Crop the video around detected faces into ``tmp/<output_file>``
         (video only).
      2. Extract the audio of the input clip to ``tmp/audio_<stem>.aac``.
      3. Mux both into ``tmp/final-<output_file>``; ffmpeg's stderr goes to
         ``tmp/ffmpeg_merge_error.log``.
      4. If ``tmp/<output stem>.srt`` exists, burn it in and write
         ``tmp/final-<output stem>_subtitled.mp4``.

    Args:
        input_file: Clip file name inside ``tmp/``.
        output_file: File name for the cropped video inside ``tmp/``.

    Returns:
        None. Any exception is caught and reported on stdout.
    """
    try:
        # No HighGUI windows are created here, so nothing has to be destroyed
        # afterwards; the helper releases the capture and writer itself.
        original_fps = _crop_to_vertical(f"tmp/{input_file}", f"tmp/{output_file}")
        if original_fps is None:
            print(f"🚨 Error: Could not read video tmp/{input_file}. Skipping short generation.")
            return

        # Extract audio
        audio_file = f"tmp/audio_{input_file.split('.')[0]}.aac"
        command = f"ffmpeg -y -hwaccel cuda -i tmp/{input_file} -vn -acodec aac {audio_file}"
        subprocess.call(command, shell=True)

        # Merge video and audio with correct sync
        merged_output = f"tmp/final-{output_file}"
        command = (
            f"ffmpeg -y -hwaccel cuda -r {original_fps} -i tmp/{output_file} "
            f"-i {audio_file} -filter_complex \"[1:a]aresample=async=1,atempo=1[a]\" -map 0:v -map \"[a]\" "
            f"-c:v libx264 -c:a aac -strict experimental -shortest {merged_output} 2> tmp/ffmpeg_merge_error.log"
        )
        subprocess.call(command, shell=True)

        if not os.path.exists(merged_output):
            print(f"🚨 Error: Final output file {merged_output} was not created. Check tmp/ffmpeg_merge_error.log for details.")
            return

        # Subtitle embedding
        # NOTE(review): this looks for the .srt next to the cropped video in
        # tmp/; confirm that the subtitle step actually writes it there before
        # this function runs, otherwise this branch only prints the warning.
        subtitle_file = f"tmp/{os.path.splitext(output_file)[0]}.srt"
        if os.path.exists(subtitle_file):
            final_output_with_subs = f"tmp/final-{os.path.splitext(output_file)[0]}_subtitled.mp4"
            command = f"ffmpeg -y -hwaccel cuda -i {merged_output} -vf subtitles={subtitle_file} -c:a copy {final_output_with_subs}"
            subprocess.call(command, shell=True)
            print(f"✅ Subtitled video saved as {final_output_with_subs}")
        else:
            print(f"🚨 Warning: No subtitle file found for {output_file}. Subtitles will be missing.")

    except Exception as e:
        print(f"Error during video cropping: {str(e)}")

def _strip_code_fence(text):
    """Return *text* without a surrounding Markdown code fence, if present.

    Handles leading/trailing whitespace, an optional ``json`` language tag
    after the opening fence, and a missing closing fence.
    """
    text = text.strip()
    if text.startswith("```"):
        text = text[3:]
        if text[:4].lower() == "json":
            text = text[4:]
        text = text.rstrip()
        if text.endswith("```"):
            text = text[:-3]
    return text.strip()


def generate_viral(transcript):
    """Ask the chat model for the three most viral segments of *transcript*.

    Args:
        transcript: Transcript text that is embedded in the prompt.

    Returns:
        A dict containing a ``"segments"`` key, or ``None`` when the reply is
        empty, is not valid JSON, or lacks ``"segments"``. A bare JSON list is
        wrapped as ``{"segments": [...]}``.
    """
    example_format = json.dumps(
        {'segments': [{'start_time': '00:00:00', 'end_time': '00:00:50', 'description': 'Engaging content', 'duration': 50}]},
        indent=4,
    )
    messages = [
        {"role": "system", "content": "You analyze video transcripts and identify viral segments."},
        {"role": "user", "content": f"Find the 3 most viral segments in this transcript and return a JSON response. The JSON should match this format: {example_format}. Transcript: {transcript}"}
    ]

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=messages,
        max_tokens=512
    )

    raw_response = response.choices[0].message.content
    print("DEBUG - GPT Raw Response:", raw_response)

    # The message content can be absent; treat that like an unparsable reply
    # instead of crashing on a string method call.
    if not raw_response:
        print("Error: OpenAI response is not valid JSON after cleanup.")
        return None

    try:
        viral_segments = json.loads(_strip_code_fence(raw_response))
    except json.JSONDecodeError:
        print("Error: OpenAI response is not valid JSON after cleanup.")
        return None

    if isinstance(viral_segments, list):
        viral_segments = {"segments": viral_segments}

    if not isinstance(viral_segments, dict) or "segments" not in viral_segments:
        print("Error: OpenAI response is missing 'segments' key.")
        return None

    return viral_segments

def generate_subtitle(input_file, output_folder, results_folder):
    """Run auto_subtitle on ``tmp/<input_file>`` and wait for the ``.srt`` file.

    The subtitle file is expected at
    ``<results_folder>/<output_folder>/<input stem>.srt``.

    Args:
        input_file: Video file name inside ``tmp/``.
        output_folder: Sub-folder of *results_folder* that receives the output.
        results_folder: Root folder for results.

    Returns:
        None. Progress and errors are reported on stdout.
    """
    # Local import: the polling loop below needs time.sleep.
    import time

    # Prefer the executable inside the Windows virtualenv; otherwise fall back
    # to the plain command name, as generate_transcript does.
    auto_subtitle_path = "./venv/Scripts/auto_subtitle.exe"
    if not os.path.exists(auto_subtitle_path):
        auto_subtitle_path = "auto_subtitle"

    target_dir = f"{results_folder}/{output_folder}"
    subtitle_file = f"{target_dir}/{os.path.splitext(input_file)[0]}.srt"

    # Check that the input video exists before running auto_subtitle
    if not os.path.exists(f"tmp/{input_file}"):
        print(f"🚨 Error: Video file {input_file} does not exist. Skipping subtitle generation.")
        return

    command = f'"{auto_subtitle_path}" tmp/{input_file} -o {target_dir} --output_srt True --srt_only True --model medium'

    print(f"📢 Running command: {command}")
    subprocess.call(command, shell=True)

    # Debug: print all files in the output directory
    if os.path.exists(target_dir):
        print("📂 DEBUG - Checking subtitle files in directory:", os.listdir(target_dir))

    # Poll for up to 15 seconds for the subtitle file to appear
    for _ in range(15):
        if os.path.exists(subtitle_file):
            print("✅ Subtitle file generated successfully:", subtitle_file)
            return
        time.sleep(1)

    print("🚨 Error: Subtitle file was not created. Skipping subtitle embedding step.")


def generate_transcript(input_file):
    """Transcribe ``tmp/<input_file>`` with auto_subtitle and return the SRT text.

    Args:
        input_file: Video file name inside ``tmp/``.

    Returns:
        The contents of ``tmp/<stem>.srt`` as a string, where ``<stem>`` is the
        file name without its final extension.

    Exits the process with status 1 when the subtitle file cannot be read.
    """
    command = f"auto_subtitle tmp/{input_file} --srt_only True --output_srt True -o tmp/ --model medium"
    subprocess.call(command, shell=True)

    # splitext only removes the last extension, so dotted names such as
    # "clip.v2.mp4" map to "clip.v2.srt" rather than "clip.srt".
    stem = os.path.splitext(os.path.basename(input_file))[0]
    try:
        with open(f"tmp/{stem}.srt", 'r', encoding='utf-8') as file:
            return file.read()
    except IOError:
        print("Error: Failed to read the input file.")
        sys.exit(1)

def _has_segments(content):
    """Return True when *content* is a dict carrying a list under 'segments'."""
    return isinstance(content, dict) and isinstance(content.get("segments"), list)


def __main__():
    """Command-line entry point: build shorts from a YouTube id or a local file.

    Workflow:
      1. Parse ``--video_id`` / ``--file`` (exactly one is required).
      2. Recreate ``tmp/`` and place the source video at
         ``tmp/input_video.mp4``.
      3. Load the segment list from ``results/<video_id>/content.txt`` or, if
         no usable cache exists, transcribe the video, query the model and
         cache the answer.
      4. Cut, crop and subtitle one short per segment.

    Exits with status 1 on invalid arguments, missing input, or when no valid
    segment list can be obtained.
    """
    parser = argparse.ArgumentParser(description='Create 3 reels or tiktoks from Youtube video')
    parser.add_argument('-v', '--video_id', required=False, help='Youtube video id. Ex: Cuptv7-A4p0 in https://www.youtube.com/watch?v=Cuptv7-A4p0')
    parser.add_argument('-f', '--file', required=False, help='Video file to be used')
    args = parser.parse_args()

    if not args.video_id and not args.file:
        print('Needed at least one argument. <command> --help for help')
        sys.exit(1)

    if args.video_id and args.file:
        print('use --video_id or --file')
        sys.exit(1)

    # Start from an empty scratch directory.
    try:
        if os.path.exists("tmp"):
            shutil.rmtree("tmp")
        os.mkdir('tmp')
    except OSError as error:
        print(error)

    filename = 'input_video.mp4'
    if args.video_id:
        video_id = args.video_id
        url = 'https://www.youtube.com/watch?v='+video_id
        download_video(url, filename)

    if args.file:
        video_id = os.path.basename(args.file).split('.')[0]
        print(video_id)
        if path.exists(args.file):
            shutil.copy(args.file, "tmp/input_video.mp4")
        else:
            print(f"File {args.file} does not exist")
            sys.exit(1)

    output_folder = 'results'

    # exist_ok avoids printing an error on every rerun for the same video.
    try:
        os.makedirs(f"{output_folder}/{video_id}", exist_ok=True)
    except OSError as error:
        print(error)

    output_file = f"{output_folder}/{video_id}/content.txt"
    parsed_content = None

    if path.exists(output_file):
        try:
            with open(output_file, 'r', encoding='utf-8') as file:
                parsed_content = json.load(file)
        except IOError:
            print("Error: Failed to read the input file.")
            sys.exit(1)
        except json.JSONDecodeError:
            parsed_content = None

        if not _has_segments(parsed_content):
            # A previous failed run may have cached unusable content (e.g.
            # "null"); regenerate instead of failing forever.
            print("Cached content is not valid; regenerating", output_file)
            parsed_content = None

    if parsed_content is None:
        transcript = generate_transcript(filename)
        print(transcript)

        viral_segments = generate_viral(transcript)
        print("DEBUG - viral_segments:", viral_segments)

        # Validate before caching so a failed model call is never persisted.
        if not _has_segments(viral_segments):
            print("Error: Parsed content is not a valid JSON dictionary with 'segments'.")
            print("DEBUG - Parsed Content:", viral_segments)
            sys.exit(1)

        try:
            with open(output_file, 'w', encoding='utf-8') as file:
                file.write(json.dumps(viral_segments, indent=4))
        except IOError:
            print("Error: Failed to write the output file.")
            sys.exit(1)
        print("Full transcription written to ", output_file)
        parsed_content = viral_segments

    generate_segments(parsed_content["segments"])

    for i, _ in enumerate(parsed_content['segments']):
        input_file = f'output{str(i).zfill(3)}.mp4'
        output_file = f'output_cropped{str(i).zfill(3)}.mp4'
        generate_short(input_file, output_file)
        generate_subtitle(f"final-{output_file}", video_id, output_folder)

if __name__ == "__main__":
    __main__()

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Audio out of sync #4

Download video

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

Audio out of sync #4

Description

Download video

Metadata

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

Issue actions