Fix audio sync

This commit is contained in:
Xpert104 2023-08-21 01:08:36 -04:00
parent b050624eee
commit 87e5ba4dcc
7 changed files with 56 additions and 45 deletions

2
.gitignore vendored
View File

@@ -1,3 +1,5 @@
.idea
.assets
.temp
__pycache__/
.vscode

View File

@@ -37,7 +37,7 @@ python run.py [options]
-o OUTPUT_PATH, --output OUTPUT_PATH specify the output file or directory
--frame-processors FRAME_PROCESSORS [FRAME_PROCESSORS ...] choose from the available frame processors (choices: face_enhancer, face_swapper, frame_enhancer, ...)
--ui-layouts UI_LAYOUTS [UI_LAYOUTS ...] choose from the available ui layouts (choices: benchmark, default, ...)
--keep-fps preserve the frames per second (fps) of the target
--fps-cap cap the frames per second (fps) of the target to 30
--keep-temp retain temporary frames after processing
--skip-audio omit audio from the target
--face-recognition {reference,many} specify the method for face recognition

View File

@@ -34,7 +34,7 @@ def parse_args() -> None:
program.add_argument('-o', '--output', help = wording.get('output_help'), dest = 'output_path')
program.add_argument('--frame-processors', help = wording.get('frame_processors_help').format(choices = ', '.join(list_module_names('facefusion/processors/frame/modules'))), dest = 'frame_processors', default = ['face_swapper'], nargs='+')
program.add_argument('--ui-layouts', help = wording.get('ui_layouts_help').format(choices = ', '.join(list_module_names('facefusion/uis/layouts'))), dest = 'ui_layouts', default = ['default'], nargs='+')
program.add_argument('--keep-fps', help = wording.get('keep_fps_help'), dest = 'keep_fps', action='store_true')
program.add_argument('--fps-cap', help = wording.get('fps_cap_help'), dest = 'fps_cap', action='store_true')
program.add_argument('--keep-temp', help = wording.get('keep_temp_help'), dest = 'keep_temp', action='store_true')
program.add_argument('--skip-audio', help = wording.get('skip_audio_help'), dest = 'skip_audio', action='store_true')
program.add_argument('--face-recognition', help = wording.get('face_recognition_help'), dest = 'face_recognition', default = 'reference', choices = facefusion.choices.face_recognition)
@@ -64,7 +64,7 @@ def parse_args() -> None:
facefusion.globals.headless = facefusion.globals.source_path is not None and facefusion.globals.target_path is not None and facefusion.globals.output_path is not None
facefusion.globals.frame_processors = args.frame_processors
facefusion.globals.ui_layouts = args.ui_layouts
facefusion.globals.keep_fps = args.keep_fps
facefusion.globals.fps_cap = args.fps_cap
facefusion.globals.keep_temp = args.keep_temp
facefusion.globals.skip_audio = args.skip_audio
facefusion.globals.face_recognition = args.face_recognition
@@ -153,13 +153,11 @@ def process_video() -> None:
update_status(wording.get('creating_temp'))
create_temp(facefusion.globals.target_path)
# extract frames
if facefusion.globals.keep_fps:
fps = detect_fps(facefusion.globals.target_path)
update_status(wording.get('extracting_frames_fps').format(fps = fps))
extract_frames(facefusion.globals.target_path, fps)
else:
update_status(wording.get('extracting_frames_fps').format(fps = 30))
extract_frames(facefusion.globals.target_path)
fps = detect_fps(facefusion.globals.target_path)
if facefusion.globals.fps_cap and fps > 30:
fps = 30
update_status(wording.get('extracting_frames_fps').format(fps = fps))
extract_frames(facefusion.globals.target_path, fps)
# process frame
temp_frame_paths = get_temp_frame_paths(facefusion.globals.target_path)
if temp_frame_paths:
@@ -171,25 +169,17 @@ def process_video() -> None:
update_status(wording.get('temp_frames_not_found'))
return
# create video
if facefusion.globals.keep_fps:
fps = detect_fps(facefusion.globals.target_path)
update_status(wording.get('creating_video_fps').format(fps = fps))
if not create_video(facefusion.globals.target_path, fps):
update_status(wording.get('creating_video_failed'))
else:
update_status(wording.get('creating_video_fps').format(fps = 30))
if not create_video(facefusion.globals.target_path):
update_status(wording.get('creating_video_failed'))
update_status(wording.get('creating_video_fps').format(fps = fps))
if not create_video(facefusion.globals.target_path, fps):
update_status(wording.get('creating_video_failed'))
# handle audio
if facefusion.globals.skip_audio:
move_temp(facefusion.globals.target_path, facefusion.globals.output_path)
update_status(wording.get('skipping_audio'))
else:
if facefusion.globals.keep_fps:
update_status(wording.get('restoring_audio'))
else:
update_status(wording.get('restoring_audio_issues'))
restore_audio(facefusion.globals.target_path, facefusion.globals.output_path)
update_status(wording.get('restoring_audio'))
restore_audio(facefusion.globals.target_path, facefusion.globals.output_path, fps)
# clear temp
update_status(wording.get('clearing_temp'))
clear_temp(facefusion.globals.target_path)

View File

@@ -8,7 +8,7 @@ output_path : Optional[str] = None
headless : Optional[bool] = None
frame_processors : List[str] = []
ui_layouts : List[str] = []
keep_fps : Optional[bool] = None
fps_cap : Optional[bool] = None
keep_temp : Optional[bool] = None
skip_audio : Optional[bool] = None
face_recognition : Optional[FaceRecognition] = None

View File

@@ -5,20 +5,20 @@ import facefusion.globals
from facefusion import wording
from facefusion.uis.typing import Update
KEEP_FPS_CHECKBOX : Optional[gradio.Checkbox] = None
FPS_CAP_CHECKBOX : Optional[gradio.Checkbox] = None
KEEP_TEMP_CHECKBOX : Optional[gradio.Checkbox] = None
SKIP_AUDIO_CHECKBOX : Optional[gradio.Checkbox] = None
def render() -> None:
global KEEP_FPS_CHECKBOX
global FPS_CAP_CHECKBOX
global KEEP_TEMP_CHECKBOX
global SKIP_AUDIO_CHECKBOX
with gradio.Box():
KEEP_FPS_CHECKBOX = gradio.Checkbox(
label = wording.get('keep_fps_checkbox_label'),
value = facefusion.globals.keep_fps
FPS_CAP_CHECKBOX = gradio.Checkbox(
label = wording.get('fps_cap_checkbox_label'),
value = facefusion.globals.fps_cap
)
KEEP_TEMP_CHECKBOX = gradio.Checkbox(
label = wording.get('keep_temp_checkbox_label'),
@@ -31,7 +31,7 @@ def render() -> None:
def listen() -> None:
KEEP_FPS_CHECKBOX.change(lambda value: update_checkbox('keep_fps', value), inputs = KEEP_FPS_CHECKBOX, outputs = KEEP_FPS_CHECKBOX)
FPS_CAP_CHECKBOX.change(lambda value: update_checkbox('fps_cap', value), inputs = FPS_CAP_CHECKBOX, outputs = FPS_CAP_CHECKBOX)
KEEP_TEMP_CHECKBOX.change(lambda value: update_checkbox('keep_temp', value), inputs = KEEP_TEMP_CHECKBOX, outputs = KEEP_TEMP_CHECKBOX)
SKIP_AUDIO_CHECKBOX.change(lambda value: update_checkbox('skip_audio', value), inputs = SKIP_AUDIO_CHECKBOX, outputs = SKIP_AUDIO_CHECKBOX)

View File

@@ -51,11 +51,11 @@ def extract_frames(target_path : str, fps : float = 30) -> bool:
trim_frame_end = facefusion.globals.trim_frame_end
commands = [ '-hwaccel', 'auto', '-i', target_path, '-q:v', str(temp_frame_quality), '-pix_fmt', 'rgb24' ]
if trim_frame_start is not None and trim_frame_end is not None:
commands.extend(['-vf', 'trim=start_frame=' + str(trim_frame_start) + ':end_frame=' + str(trim_frame_end) + ',fps=' + str(fps)])
commands.extend(['-vf', 'trim=start_frame=' + str(trim_frame_start) + ':end_frame=' + str(trim_frame_end) + ',fps=' + str(fps) + ',setpts=(PTS-STARTPTS)'])
elif trim_frame_start is not None:
commands.extend(['-vf', 'trim=start_frame=' + str(trim_frame_start) + ',fps=' + str(fps)])
commands.extend(['-vf', 'trim=start_frame=' + str(trim_frame_start) + ',fps=' + str(fps) + ',setpts=(PTS-STARTPTS)'])
elif trim_frame_end is not None:
commands.extend(['-vf', 'trim=end_frame=' + str(trim_frame_end) + ',fps=' + str(fps)])
commands.extend(['-vf', 'trim=end_frame=' + str(trim_frame_end) + ',fps=' + str(fps) + ',setpts=(PTS-STARTPTS)'])
else:
commands.extend(['-vf', 'fps=' + str(fps)])
commands.extend([os.path.join(temp_directory_path, '%04d.' + facefusion.globals.temp_frame_format)])
@@ -75,18 +75,37 @@ def create_video(target_path : str, fps : float = 30) -> bool:
return run_ffmpeg(commands)
def restore_audio(target_path : str, output_path : str) -> None:
def restore_audio(target_path : str, output_path : str, fps: int) -> None:
trim_frame_start = facefusion.globals.trim_frame_start
trim_frame_end = facefusion.globals.trim_frame_end
temp_output_path = get_temp_output_path(target_path)
commands = [ '-hwaccel', 'auto', '-i', temp_output_path, '-i', target_path ]
if trim_frame_start is not None and trim_frame_end is not None:
commands.extend([ '-filter:v', 'select=between(n,' + str(trim_frame_start) + ',' + str(trim_frame_end) + ')' ])
elif trim_frame_start is not None:
commands.extend([ '-filter:v', 'select=gt(n,' + str(trim_frame_start) + ')' ])
elif trim_frame_end is not None:
commands.extend([ '-filter:v', 'select=lt(n,' + str(trim_frame_end) + ')' ])
commands.extend([ '-c:a', 'copy', '-map', '0:v:0', '-map', '1:a:0', '-y', output_path ])
# Create temp audio file extracted from target video
temp_target_audio_path = os.path.join(os.path.dirname(target_path), Path(target_path).stem + '.mp3')
commands = ['-hwaccel', 'auto', '-i', target_path, '-vn', '-y', temp_target_audio_path ]
done = run_ffmpeg(commands)
if not done:
move_temp(target_path, output_path)
return
# Trim audio file
temp_target_audio_trimmed_path = temp_target_audio_path.replace('.mp3', '_trimmed.mp3')
if trim_frame_start is None:
trim_frame_start = 0
start_time_ms = round(trim_frame_start/fps, 3) * 1000
commands = ['-hwaccel', 'auto', '-ss', str(start_time_ms) + 'ms' ]
if trim_frame_end is not None:
end_time_ms = round(trim_frame_end/fps, 3) * 1000
commands.extend([ '-to', str(end_time_ms) + 'ms'])
commands.extend(['-i', temp_target_audio_path, '-vn', '-c', 'copy', '-y', temp_target_audio_trimmed_path])
print(commands)
done = run_ffmpeg(commands)
if not done:
move_temp(target_path, output_path)
return
# Add audio to temp output
commands = ['-hwaccel', 'auto', '-i', temp_output_path, '-i', temp_target_audio_trimmed_path, '-c:v', 'copy', '-map', '0:v', '-map', '1:a', '-y', output_path ]
done = run_ffmpeg(commands)
if not done:
move_temp(target_path, output_path)

View File

@@ -7,7 +7,7 @@ WORDING =\
'output_help': 'specify the output file or directory',
'frame_processors_help': 'choose from the available frame processors (choices: {choices}, ...)',
'ui_layouts_help': 'choose from the available ui layouts (choices: {choices}, ...)',
'keep_fps_help': 'preserve the frames per second (fps) of the target',
'fps_cap_help': 'cap the frames per second (fps) of the target to 30',
'keep_temp_help': 'retain temporary frames after processing',
'skip_audio_help': 'omit audio from the target',
'face_recognition_help': 'specify the method for face recognition',
@@ -68,7 +68,7 @@ WORDING =\
'preview_image_label': 'PREVIEW',
'preview_frame_slider_label': 'PREVIEW FRAME',
'frame_processors_checkbox_group_label': 'FRAME PROCESSORS',
'keep_fps_checkbox_label': 'KEEP FPS',
'fps_cap_checkbox_label': 'FPS CAP',
'keep_temp_checkbox_label': 'KEEP TEMP',
'skip_audio_checkbox_label': 'SKIP AUDIO',
'temp_frame_format_dropdown_label': 'TEMP FRAME FORMAT',