cellRec: implement audio encoding (no mixing yet)

2024-11-22 10:42:36 +01:00 · 2023-11-15 00:24:36 +01:00 · 2023-11-15 00:24:36 +01:00 · 51d0df97d3
commit 51d0df97d3
parent 4c14290694
2 changed files with 133 additions and 42 deletions
--- a/rpcs3/Emu/Cell/Modules/cellRec.cpp
+++ b/rpcs3/Emu/Cell/Modules/cellRec.cpp
@ -155,7 +155,8 @@ struct rec_param
 	}
 };
-constexpr u32 rec_framerate = 30; // Always 30 fps
+static constexpr u32 rec_framerate = 30; // Always 30 fps
 static constexpr u32 rec_channels = 2; // Always 2 channels
 class rec_video_sink : public utils::video_sink
 {
@ -219,11 +220,21 @@ struct rec_info
 	vm::bptr<u8> video_input_buffer{}; // Used by the game to inject a frame right before it would render a frame to the screen.
 	vm::bptr<u8> audio_input_buffer{}; // Used by the game to inject audio: 2-channel interleaved (left-right) * 256 samples * sizeof(f32) at 48000 Hz
 	// Wrapper for our audio data: one fixed-size block of samples as stored in the audio ringbuffer,
 	// paired with the audio pts that was assigned to it when the block was written.
 	struct audio_block
 	{
 		// 2-channel interleaved (left-right), 256 samples, float
 		static constexpr usz block_size = rec_channels * CELL_REC_AUDIO_BLOCK_SAMPLES * sizeof(f32);
 		// Raw bytes of one block (filled via memcpy by the provider thread), zero-initialized by default
 		std::array<u8, block_size> block{};
 		// pts of this block; on flush the pts of the oldest block is subtracted to obtain a relative timestamp
 		s64 pts{};
 	};
 	std::vector<utils::video_sink::encoder_frame> video_ringbuffer;
-	std::vector<u8> audio_ringbuffer;
+	std::vector<audio_block> audio_ringbuffer;
 	usz video_ring_pos = 0;
 	usz video_ring_frame_count = 0;
-	usz audio_ring_step = 0;
+	usz audio_ring_pos = 0;
 	usz audio_ring_block_count = 0;
 	usz next_video_ring_pos()
 	{
@ -232,6 +243,13 @@ struct rec_info
 		return pos;
 	}
 	usz next_audio_ring_pos()
 	{
 		// Hand out the current write slot, then move the cursor forward with wrap-around.
 		const usz current = audio_ring_pos++;
 		audio_ring_pos %= audio_ringbuffer.size();
 		return current;
 	}
 	std::shared_ptr<rec_video_sink> ringbuffer_sink;
 	std::shared_ptr<utils::video_encoder> encoder;
 	std::unique_ptr<named_thread<std::function<void()>>> video_provider_thread;
@ -245,13 +263,13 @@ struct rec_info
 	u32 video_bps = 512000;
 	s32 video_codec_id = 12; // AV_CODEC_ID_MPEG4
 	s32 max_b_frames = 2;
-	const u32 fps = rec_framerate; // Always 30 fps
+	static constexpr u32 fps = rec_framerate; // Always 30 fps
 	// Audio parameters
 	u32 sample_rate = 48000;
 	u32 audio_bps = 64000;
 	s32 audio_codec_id = 86018; // AV_CODEC_ID_AAC
-	const u32 channels = 2; // Always 2 channels
+	static constexpr u32 channels = rec_channels; // Always 2 channels
 	// Recording duration
 	atomic_t<u64> recording_time_start = 0;
@ -588,8 +606,7 @@ void rec_info::start_video_provider()
 			}
 			// We only care for new video frames or audio samples that can be properly encoded, so we check the timestamps and pts.
-			const usz timestamp_us = get_system_time() - recording_time_start - pause_time_total;
+			const usz timestamp_ms = (get_system_time() - recording_time_start - pause_time_total) / 1000;
 			const usz timestamp_ms = timestamp_us / 1000;
 			/////////////////
 			//    VIDEO    //
@ -632,7 +649,7 @@ void rec_info::start_video_provider()
 				// The video frames originate from our render pipeline and are stored in a ringbuffer.
 				utils::video_sink::encoder_frame frame = ringbuffer_sink->get_frame();
-				if (const s64 pts = encoder->get_pts(frame.timestamp_ms); pts > last_video_pts && frame.data.size() > 0)
+				if (const s64 pts = encoder->get_pts(frame.timestamp_ms); pts > last_video_pts && !frame.data.empty())
 				{
 					ensure(frame.data.size() == frame_size);
 					utils::video_sink::encoder_frame& frame_data = video_ringbuffer[next_video_ring_pos()];
@ -647,34 +664,75 @@ void rec_info::start_video_provider()
 				// The video frames originate from our render pipeline and are directly encoded by the encoder video sink itself.
 			//}
-			if (use_internal_audio)
+			/////////////////
-			{
+			//    AUDIO    //
-				// TODO: fetch audio
+			/////////////////
 			}
-			if (use_external_audio && audio_input_buffer)
+			const usz timestamp_us = get_system_time() - recording_time_start - pause_time_total;
 			{
 				// 2-channel interleaved (left-right), 256 samples, float
 				std::array<f32, 2 * CELL_REC_AUDIO_BLOCK_SAMPLES> audio_data{};
 				std::memcpy(audio_data.data(), audio_input_buffer.get_ptr(), audio_data.size() * sizeof(f32));
-				// TODO: mix audio with param.audio_input_mix_vol
+			// TODO: mix external and internal audio with param.audio_input_mix_vol
-			}
+			// TODO: mix channels if necessary
 			if (use_external_audio)
 			{
 				// The audio samples originate from cellRec instead of our render pipeline.
 				// TODO: This needs to be synchronized with the game somehow if possible.
 				if (const s64 pts = encoder->get_audio_pts(timestamp_us); pts > last_audio_pts)
 				{
 					if (audio_input_buffer)
 					{
 						if (use_ring_buffer)
 						{
 							// The audio samples originate from cellRec and are stored in a ringbuffer.
 							audio_block& sample_block = audio_ringbuffer[next_audio_ring_pos()];
 							std::memcpy(sample_block.block.data(), audio_input_buffer.get_ptr(), sample_block.block.size());
 							sample_block.pts = pts;
 							audio_ring_block_count++;
 						}
 						else
 						{
 							// The audio samples originate from cellRec and are pushed to the encoder immediately.
 							encoder->add_audio_samples(audio_input_buffer.get_ptr(), CELL_REC_AUDIO_BLOCK_SAMPLES, channels, timestamp_us);
 						}
 					}
-			if (use_ring_buffer)
+					last_audio_pts = pts;
-			{
+				}
 				// TODO: add audio properly
 				//std::memcpy(&ringbuffer[get_ring_pos(pts) + ring_audio_offset], audio_data.data(), audio_data.size());
 			}
-			else
+			else if (use_ring_buffer && ringbuffer_sink && use_internal_audio)
 			{
-				// TODO: add audio to encoder
+				// The audio samples originate from cellAudio and are stored in a ringbuffer.
 				utils::video_sink::encoder_sample sample = ringbuffer_sink->get_sample();
 				if (!sample.data.empty() && sample.channels >= 2 && sample.sample_count >= CELL_REC_AUDIO_BLOCK_SAMPLES)
 				{
 					s64 pts = encoder->get_audio_pts(sample.timestamp_us);
 					// Each encoder_sample can have more than one block
 					for (usz i = 0; i < sample.sample_count; i += CELL_REC_AUDIO_BLOCK_SAMPLES)
 					{
 						if (pts > last_audio_pts)
 						{
 							audio_block& sample_block = audio_ringbuffer[next_audio_ring_pos()];
 							std::memcpy(sample_block.block.data(), &sample.data[i * channels * sizeof(f32)], sample_block.block.size());
 							sample_block.pts = pts;
 							last_audio_pts = pts;
 							audio_ring_block_count++;
 						}
 						// Increase pts for each sample block
 						pts++;
 					}
 				}
 			}
 			//else
 			//{
 				// The audio samples originate from cellAudio and are directly encoded by the encoder video sink itself.
 			//}
 			// Update recording time
 			recording_time_total = encoder->get_timestamp_ms(encoder->last_video_pts());
-			thread_ctrl::wait_for(100);
+			thread_ctrl::wait_for(1);
 		}
 	});
 }
@ -705,7 +763,7 @@ void rec_info::stop_video_provider(bool flush)
 	// Flush the ringbuffer if necessary.
 	// This should only happen if the video sink is not the encoder itself.
 	// In this case the encoder should have been idle until now.
-	if (flush && param.ring_sec > 0 && !video_ringbuffer.empty())
+	if (flush && param.ring_sec > 0 && (!video_ringbuffer.empty() || !audio_ringbuffer.empty()))
 	{
 		cellRec.notice("Flushing video ringbuffer.");
@ -714,19 +772,51 @@ void rec_info::stop_video_provider(bool flush)
 		ensure(encoder);
 		const usz frame_count = std::min(video_ringbuffer.size(), video_ring_frame_count);
-		const usz start_offset = video_ring_frame_count < video_ringbuffer.size() ? 0 : video_ring_frame_count;
+		const usz video_start_offset = video_ring_frame_count < video_ringbuffer.size() ? 0 : video_ring_frame_count;
-		const s64 start_pts = video_ringbuffer[start_offset % video_ringbuffer.size()].pts;
+		const s64 video_start_pts = video_ringbuffer.empty() ? 0 : video_ringbuffer[video_start_offset % video_ringbuffer.size()].pts;
-		for (usz i = 0; i < frame_count; i++)
+		const usz block_count = std::min(audio_ringbuffer.size(), audio_ring_block_count);
 		const usz audio_start_offset = audio_ring_block_count < audio_ringbuffer.size() ? 0 : audio_ring_block_count;
 		const s64 audio_start_pts = audio_ringbuffer.empty() ? 0 : audio_ringbuffer[audio_start_offset % audio_ringbuffer.size()].pts;
 		cellRec.error("Flushing video ringbuffer: block_count=%d, audio_ringbuffer.size=%d", block_count, audio_ringbuffer.size());
 		cellRec.error("Flushing video ringbuffer: video_start_pts=%d, audio_start_pts=%d", video_start_pts, audio_start_pts);
 		// Try to add the frames and samples in proper order
 		for (usz sync_timestamp_us = 0, frame = 0, block = 0; frame < frame_count || block < block_count; frame++)
 		{
-			const usz pos = (start_offset + i) % video_ringbuffer.size();
+			// Add one frame
-			utils::video_sink::encoder_frame& frame_data = video_ringbuffer[pos];
+			if (frame < frame_count)
-			encoder->add_frame(frame_data.data, frame_data.pitch, frame_data.width, frame_data.height, frame_data.av_pixel_format, encoder->get_timestamp_ms(frame_data.pts - start_pts));
+			{
 				const usz pos = (video_start_offset + frame) % video_ringbuffer.size();
 				utils::video_sink::encoder_frame& frame_data = video_ringbuffer[pos];
 				const usz timestamp_ms = encoder->get_timestamp_ms(frame_data.pts - video_start_pts);
 				encoder->add_frame(frame_data.data, frame_data.pitch, frame_data.width, frame_data.height, frame_data.av_pixel_format, timestamp_ms);
-			// TODO: add audio data to encoder
+				// Increase sync timestamp
 				sync_timestamp_us = timestamp_ms * 1000;
 			}
 			// Add all the samples that fit into the last frame
 			for (usz i = block; i < block_count; i++)
 			{
 				const usz pos = (audio_start_offset + i) % audio_ringbuffer.size();
 				const audio_block& sample_block = audio_ringbuffer[pos];
 				const usz timestamp_us = encoder->get_audio_timestamp_us(sample_block.pts - audio_start_pts);
 				// Stop adding new samples if the sync timestamp is exceeded, unless we already added all the frames.
 				if (timestamp_us > sync_timestamp_us && frame < frame_count)
 				{
 					break;
 				}
 				encoder->add_audio_samples(sample_block.block.data(), CELL_REC_AUDIO_BLOCK_SAMPLES, channels, timestamp_us);
 				block++;
 			}
 		}
 		video_ringbuffer.clear();
 		audio_ringbuffer.clear();
 	}
 }
@ -1093,6 +1183,8 @@ error_code cellRecOpen(vm::cptr<char> pDirName, vm::cptr<char> pFileName, vm::cp
 	rec.cbUserData = cbUserData;
 	rec.last_video_pts = -1;
 	rec.audio_ringbuffer.clear();
 	rec.audio_ring_block_count = 0;
 	rec.audio_ring_pos = 0;
 	rec.video_ringbuffer.clear();
 	rec.video_ring_frame_count = 0;
 	rec.video_ring_pos = 0;
@ -1103,16 +1195,13 @@ error_code cellRecOpen(vm::cptr<char> pDirName, vm::cptr<char> pFileName, vm::cp
 	if (rec.param.ring_sec > 0)
 	{
-		const u32 audio_size_per_sample = rec.channels * sizeof(float);
+		const usz audio_ring_buffer_size = static_cast<usz>(std::ceil((rec.param.ring_sec * rec.sample_rate) / static_cast<f32>(CELL_REC_AUDIO_BLOCK_SAMPLES)));
 		const u32 audio_size_per_second = rec.sample_rate * audio_size_per_sample;
 		const usz audio_ring_buffer_size = rec.param.ring_sec * audio_size_per_second;
 		const usz video_ring_buffer_size = rec.param.ring_sec * rec.fps;
 		cellRec.notice("Preparing ringbuffer for %d seconds. video_ring_buffer_size=%d, audio_ring_buffer_size=%d, pitch=%d, width=%d, height=%d", rec.param.ring_sec, video_ring_buffer_size, audio_ring_buffer_size, rec.input_format.pitch, rec.input_format.width, rec.input_format.height);
 		rec.audio_ringbuffer.resize(audio_ring_buffer_size);
-		rec.audio_ring_step = audio_size_per_sample;
+		rec.video_ringbuffer.resize(video_ring_buffer_size);
 		rec.video_ringbuffer.resize(video_ring_buffer_size, {});
 		rec.ringbuffer_sink = std::make_shared<rec_video_sink>();
 		rec.ringbuffer_sink->use_internal_audio = rec.param.use_internal_audio();
--- a/rpcs3/util/video_sink.h
+++ b/rpcs3/util/video_sink.h
@ -26,7 +26,7 @@ namespace utils
 			m_frames_to_encode.emplace_back(timestamp_ms, pitch, width, height, pixel_format, std::move(frame));
 		}
-		void add_audio_samples(u8* buf, u32 sample_count, u16 channels, usz timestamp_us)
+		void add_audio_samples(const u8* buf, u32 sample_count, u16 channels, usz timestamp_us)
 		{
 			// Do not allow new samples while flushing
 			if (m_flush || !buf || !sample_count || !channels)
@ -51,12 +51,14 @@ namespace utils
 		usz get_timestamp_ms(s64 pts) const
 		{
-			return static_cast<usz>(std::round((pts * 1000) / static_cast<float>(m_framerate)));
+			return static_cast<usz>(std::round((pts * 1000) / static_cast<f32>(m_framerate)));
 		}
 		usz get_audio_timestamp_us(s64 pts) const
 		{
-			return static_cast<usz>(std::round((pts * 1000) / static_cast<float>(m_sample_rate)));
+			static constexpr f32 us_per_sec = 1000000.0f;
 			const f32 us_per_block = us_per_sec / (m_sample_rate / static_cast<f32>(m_samples_per_block));
 			return static_cast<usz>(pts * us_per_block);
 		}
 		atomic_t<bool> has_error{false};