OpenShot Library | libopenshot 0.5.0
Loading...
Searching...
No Matches
AudioWaveformer.cpp
Go to the documentation of this file.
1
9// Copyright (c) 2008-2022 OpenShot Studios, LLC
10//
11// SPDX-License-Identifier: LGPL-3.0-or-later
12
13#include "AudioWaveformer.h"
14
15#include <cmath>
16
17#include <algorithm>
18#include <chrono>
19#include <memory>
20#include <thread>
21#include <vector>
22
23#include "Clip.h"
24#include "Exceptions.h"
25#include "FrameMapper.h"
26#include "FFmpegReader.h"
27#include "Timeline.h"
28
29
30using namespace std;
31using namespace openshot;
32
33
34// Default constructor
36 reader(new_reader),
37 detached_reader(nullptr),
38 resolved_reader(nullptr),
39 source_initialized(false)
40{
41
42}
43
44// Destructor
49
50// Extract audio samples from any ReaderBase class
51AudioWaveformData AudioWaveformer::ExtractSamples(int channel, int num_per_second, bool normalize) {
52 // Legacy entry point: resolve a source reader (unwrap Clip/FrameMapper), then extract audio-only.
54 if (!reader) {
55 return data;
56 }
57
58 ReaderBase* source = ResolveWaveformReader();
59
60 Fraction source_fps = ResolveSourceFPS(source);
61
62 AudioWaveformData base = ExtractSamplesFromReader(source, channel, num_per_second, false);
63
64 // If this is a Clip, apply its keyframes using project fps (timeline if available, else reader fps)
65 if (auto clip = dynamic_cast<Clip*>(reader)) {
66 Timeline* timeline = dynamic_cast<Timeline*>(clip->ParentTimeline());
67 Fraction project_fps = timeline ? timeline->info.fps : clip->Reader()->info.fps;
68 return ApplyKeyframes(base, &clip->time, &clip->volume, project_fps, source_fps, source->info.channels, num_per_second, channel, normalize);
69 }
70
71 // No keyframes to apply
72 if (normalize) {
73 float max_sample = 0.0f;
74 for (auto v : base.max_samples) {
75 max_sample = std::max(max_sample, std::abs(v));
76 }
77 if (max_sample > 0.0f) {
78 base.scale(static_cast<int>(base.max_samples.size()), 1.0f / max_sample);
79 }
80 }
81 return base;
82}
83
84AudioWaveformData AudioWaveformer::ExtractSamples(const std::string& path, int channel, int num_per_second, bool normalize) {
85 FFmpegReader temp_reader(path);
86 temp_reader.Open();
87 // Disable video for speed
88 bool has_video = temp_reader.info.has_video;
89 temp_reader.info.has_video = false;
90 AudioWaveformData data = ExtractSamplesFromReader(&temp_reader, channel, num_per_second, normalize);
91 temp_reader.info.has_video = has_video;
92 temp_reader.Close();
93 return data;
94}
95
97 const Keyframe* time_keyframe,
98 const Keyframe* volume_keyframe,
99 const Fraction& project_fps,
100 int channel,
101 int num_per_second,
102 bool normalize) {
103 FFmpegReader temp_reader(path);
104 temp_reader.Open();
105 bool has_video = temp_reader.info.has_video;
106 temp_reader.info.has_video = false;
107 Fraction source_fps = temp_reader.info.fps;
108 AudioWaveformData base = ExtractSamplesFromReader(&temp_reader, channel, num_per_second, false);
109 temp_reader.info.has_video = has_video;
110 temp_reader.Close();
111 return ApplyKeyframes(base, time_keyframe, volume_keyframe, project_fps, source_fps, temp_reader.info.channels, num_per_second, channel, normalize);
112}
113
115 const Keyframe* time_keyframe,
116 const Keyframe* volume_keyframe,
117 const Fraction& project_fps,
118 const Fraction& source_fps,
119 int source_channels,
120 int num_per_second,
121 int channel,
122 bool normalize) {
124 if (num_per_second <= 0) {
125 return data;
126 }
127
128 double project_fps_value = project_fps.ToDouble();
129 double source_fps_value = source_fps.ToDouble();
130 if (project_fps_value <= 0.0 || source_fps_value <= 0.0) {
131 return data;
132 }
133
134 if (channel != -1 && (channel < 0 || channel >= source_channels)) {
135 return data;
136 }
137
138 size_t base_total = base.max_samples.size();
139 if (base_total == 0) {
140 return data;
141 }
142
143 // Determine output duration from time curve (if any). Time curves are in project-frame domain.
144 int64_t output_frames = 0;
145 if (time_keyframe && time_keyframe->GetCount() > 0) {
146 output_frames = time_keyframe->GetLength();
147 }
148 if (output_frames <= 0) {
149 // Default to source duration derived from base waveform length
150 double source_duration = static_cast<double>(base_total) / static_cast<double>(num_per_second);
151 output_frames = static_cast<int64_t>(std::llround(source_duration * project_fps_value));
152 }
153 double output_duration_seconds = static_cast<double>(output_frames) / project_fps_value;
154 int total_samples = static_cast<int>(std::ceil(output_duration_seconds * num_per_second));
155
156 if (total_samples <= 0) {
157 return data;
158 }
159
160 data.resize(total_samples);
161 data.zero(total_samples);
162
163 for (int i = 0; i < total_samples; ++i) {
164 double out_time = static_cast<double>(i) / static_cast<double>(num_per_second);
165 // Time keyframes are defined in project-frame domain; evaluate using project frames
166 double project_frame = out_time * project_fps_value;
167 double mapped_project_frame = time_keyframe ? time_keyframe->GetValue(project_frame) : project_frame;
168 // Convert mapped project frame to seconds (project FPS), then to waveform index
169 double source_time = mapped_project_frame / project_fps_value;
170 double source_index = source_time * static_cast<double>(num_per_second);
171
172 // Sample base waveform (nearest with simple linear blend)
173 int idx0 = static_cast<int>(std::floor(source_index));
174 int idx1 = idx0 + 1;
175 double frac = source_index - static_cast<double>(idx0);
176
177 float max_sample = 0.0f;
178 float rms_sample = 0.0f;
179 if (idx0 >= 0 && idx0 < static_cast<int>(base_total)) {
180 max_sample = base.max_samples[idx0];
181 rms_sample = base.rms_samples[idx0];
182 }
183 if (idx1 >= 0 && idx1 < static_cast<int>(base_total)) {
184 max_sample = static_cast<float>((1.0 - frac) * max_sample + frac * base.max_samples[idx1]);
185 rms_sample = static_cast<float>((1.0 - frac) * rms_sample + frac * base.rms_samples[idx1]);
186 }
187
188 double gain = 1.0;
189 if (volume_keyframe) {
190 double project_frame = out_time * project_fps_value;
191 gain = volume_keyframe->GetValue(project_frame);
192 }
193 max_sample = static_cast<float>(max_sample * gain);
194 rms_sample = static_cast<float>(rms_sample * gain);
195
196 data.max_samples[i] = max_sample;
197 data.rms_samples[i] = rms_sample;
198 }
199
200 if (normalize) {
201 float samples_max = 0.0f;
202 for (auto v : data.max_samples) {
203 samples_max = std::max(samples_max, std::abs(v));
204 }
205 if (samples_max > 0.0f) {
206 data.scale(total_samples, 1.0f / samples_max);
207 }
208 }
209
210 return data;
211}
212
213AudioWaveformData AudioWaveformer::ExtractSamplesFromReader(ReaderBase* source_reader, int channel, int num_per_second, bool normalize) {
215
216 if (!source_reader || num_per_second <= 0) {
217 return data;
218 }
219
220 // Open reader (if needed)
221 if (!source_reader->IsOpen()) {
222 source_reader->Open();
223 }
224
225 const auto retry_delay = std::chrono::milliseconds(100);
226 const auto max_wait_for_open = std::chrono::milliseconds(3000);
227
228 auto get_frame_with_retry = [&](int64_t frame_number) -> std::shared_ptr<openshot::Frame> {
229 std::chrono::steady_clock::time_point wait_start;
230 bool waiting_for_open = false;
231 while (true) {
232 try {
233 return source_reader->GetFrame(frame_number);
234 } catch (const openshot::ReaderClosed&) {
235 auto now = std::chrono::steady_clock::now();
236 if (!waiting_for_open) {
237 waiting_for_open = true;
238 wait_start = now;
239 } else if (now - wait_start >= max_wait_for_open) {
240 throw;
241 }
242
243 std::this_thread::sleep_for(retry_delay);
244 }
245 }
246 };
247
248 int sample_rate = source_reader->info.sample_rate;
249 if (sample_rate <= 0) {
250 sample_rate = num_per_second;
251 }
252 int sample_divisor = sample_rate / num_per_second;
253 if (sample_divisor <= 0) {
254 sample_divisor = 1;
255 }
256
257 // Determine length of video frames (for waveform)
258 int64_t reader_video_length = source_reader->info.video_length;
259 if (reader_video_length < 0) {
260 reader_video_length = 0;
261 }
262 float reader_duration = source_reader->info.duration;
263 double fps_value = source_reader->info.fps.ToDouble();
264 float frames_duration = 0.0f;
265 if (reader_video_length > 0 && fps_value > 0.0) {
266 frames_duration = static_cast<float>(reader_video_length / fps_value);
267 }
268 if (reader_duration <= 0.0f) {
269 reader_duration = frames_duration;
270 }
271 if (reader_duration < 0.0f) {
272 reader_duration = 0.0f;
273 }
274
275 if (!source_reader->info.has_audio) {
276 return data;
277 }
278
279 int total_samples = static_cast<int>(std::ceil(reader_duration * num_per_second));
280 if (total_samples <= 0 || source_reader->info.channels == 0) {
281 return data;
282 }
283
284 if (channel != -1 && (channel < 0 || channel >= source_reader->info.channels)) {
285 return data;
286 }
287
288 // Resize and clear audio buffers
289 data.resize(total_samples);
290 data.zero(total_samples);
291
292 int extracted_index = 0;
293 int sample_index = 0;
294 float samples_max = 0.0f;
295 float chunk_max = 0.0f;
296 double chunk_squared_sum = 0.0;
297
298 int channel_count = (channel == -1) ? source_reader->info.channels : 1;
299 std::vector<float*> channels(source_reader->info.channels, nullptr);
300
301 try {
302 for (int64_t f = 1; f <= reader_video_length && extracted_index < total_samples; f++) {
303 std::shared_ptr<openshot::Frame> frame = get_frame_with_retry(f);
304
305 for (int channel_index = 0; channel_index < source_reader->info.channels; channel_index++) {
306 if (channel == channel_index || channel == -1) {
307 channels[channel_index] = frame->GetAudioSamples(channel_index);
308 }
309 }
310
311 int sample_count = frame->GetAudioSamplesCount();
312 for (int s = 0; s < sample_count; s++) {
313 for (int channel_index = 0; channel_index < source_reader->info.channels; channel_index++) {
314 if (channel == channel_index || channel == -1) {
315 float *samples = channels[channel_index];
316 if (!samples) {
317 continue;
318 }
319 float abs_sample = std::abs(samples[s]);
320 chunk_squared_sum += static_cast<double>(samples[s]) * static_cast<double>(samples[s]);
321 chunk_max = std::max(chunk_max, abs_sample);
322 }
323 }
324
325 sample_index += 1;
326
327 if (sample_index % sample_divisor == 0) {
328 float avg_squared_sum = 0.0f;
329 if (channel_count > 0) {
330 avg_squared_sum = static_cast<float>(chunk_squared_sum / static_cast<double>(sample_divisor * channel_count));
331 }
332
333 if (extracted_index < total_samples) {
334 data.max_samples[extracted_index] = chunk_max;
335 data.rms_samples[extracted_index] = std::sqrt(avg_squared_sum);
336 samples_max = std::max(samples_max, chunk_max);
337 extracted_index++;
338 }
339
340 sample_index = 0;
341 chunk_max = 0.0f;
342 chunk_squared_sum = 0.0;
343
344 if (extracted_index >= total_samples) {
345 break;
346 }
347 }
348 }
349 }
350 } catch (...) {
351 throw;
352 }
353
354 if (sample_index > 0 && extracted_index < total_samples) {
355 float avg_squared_sum = 0.0f;
356 if (channel_count > 0) {
357 avg_squared_sum = static_cast<float>(chunk_squared_sum / static_cast<double>(sample_index * channel_count));
358 }
359
360 data.max_samples[extracted_index] = chunk_max;
361 data.rms_samples[extracted_index] = std::sqrt(avg_squared_sum);
362 samples_max = std::max(samples_max, chunk_max);
363 extracted_index++;
364 }
365
366 if (normalize && samples_max > 0.0f) {
367 float scale = 1.0f / samples_max;
368 data.scale(total_samples, scale);
369 }
370
371 return data;
372}
373
374ReaderBase* AudioWaveformer::ResolveSourceReader(ReaderBase* source_reader) {
375 if (!source_reader) {
376 return nullptr;
377 }
378
379 ReaderBase* current = source_reader;
380 while (true) {
381 if (auto clip = dynamic_cast<Clip*>(current)) {
382 current = clip->Reader();
383 continue;
384 }
385 if (auto mapper = dynamic_cast<FrameMapper*>(current)) {
386 current = mapper->Reader();
387 continue;
388 }
389 break;
390 }
391 return current;
392}
393
394Fraction AudioWaveformer::ResolveSourceFPS(ReaderBase* source_reader) {
395 if (!source_reader) {
396 return Fraction(0, 1);
397 }
398 return source_reader->info.fps;
399}
400
401// Resolve and cache the reader used for waveform extraction (prefer a detached FFmpegReader clone)
402ReaderBase* AudioWaveformer::ResolveWaveformReader() {
403 if (source_initialized) {
404 return resolved_reader ? resolved_reader : reader;
405 }
406 source_initialized = true;
407
408 resolved_reader = ResolveSourceReader(reader);
409
410 // Prefer a detached, audio-only FFmpegReader clone so we never mutate the live reader used for preview.
411 if (auto ff_reader = dynamic_cast<FFmpegReader*>(resolved_reader)) {
412 const Json::Value ff_json = ff_reader->JsonValue();
413 const std::string path = ff_json.get("path", "").asString();
414 if (!path.empty()) {
415 try {
416 auto clone = std::make_unique<FFmpegReader>(path, false);
417 clone->SetJsonValue(ff_json);
418 clone->info.has_video = false; // explicitly audio-only for waveform extraction
419 detached_reader = std::move(clone);
420 resolved_reader = detached_reader.get();
421 } catch (...) {
422 // Fall back to using the original reader if cloning fails
423 detached_reader.reset();
424 resolved_reader = ResolveSourceReader(reader);
425 }
426 }
427 }
428
429 return resolved_reader ? resolved_reader : reader;
430}
Header file for AudioWaveformer class.
Header file for Clip class.
Header file for all Exception classes.
Header file for FFmpegReader class.
Header file for the FrameMapper class.
Header file for Timeline class.
AudioWaveformer(ReaderBase *reader)
Default constructor.
AudioWaveformData ApplyKeyframes(const AudioWaveformData &base, const openshot::Keyframe *time_keyframe, const openshot::Keyframe *volume_keyframe, const openshot::Fraction &project_fps, const openshot::Fraction &source_fps, int source_channels, int num_per_second, int channel, bool normalize)
Apply time and volume keyframes to an existing waveform data set.
AudioWaveformData ExtractSamples(int channel, int num_per_second, bool normalize)
Extract audio samples from any ReaderBase class (legacy overload, now delegates to audio-only path)
This class represents a clip (used to arrange readers on the timeline)
Definition Clip.h:89
This class uses the FFmpeg libraries to open video and audio files, and return openshot::Frame objects.
void Open() override
Open File - which is called by the constructor automatically.
void Close() override
Close File.
This class represents a fraction.
Definition Fraction.h:30
double ToDouble() const
Return this fraction as a double (i.e. 1/2 = 0.5)
Definition Fraction.cpp:40
This class creates a mapping between 2 different frame rates, applying a specific pull-down technique...
A Keyframe is a collection of Point instances, which is used to vary a number or property over time.
Definition KeyFrame.h:53
int64_t GetLength() const
Definition KeyFrame.cpp:417
double GetValue(int64_t index) const
Get the value at a specific index.
Definition KeyFrame.cpp:258
int64_t GetCount() const
Get the number of points in this keyframe.
Definition KeyFrame.cpp:424
This abstract class is the base class, used by all readers in libopenshot.
Definition ReaderBase.h:76
virtual bool IsOpen()=0
Determine if reader is open or closed.
openshot::ReaderInfo info
Information about the current media file.
Definition ReaderBase.h:88
virtual void Open()=0
Open the reader (and start consuming resources, such as images or video files)
virtual std::shared_ptr< openshot::Frame > GetFrame(int64_t number)=0
Exception when a reader is closed, and a frame is requested.
Definition Exceptions.h:364
This class represents a timeline.
Definition Timeline.h:154
This namespace is the default namespace for all code in the openshot library.
Definition Compressor.h:29
This struct holds the extracted waveform data (both the RMS root-mean-squared average and the max sample values).
void resize(int total_samples)
Resize both datasets.
std::vector< float > rms_samples
std::vector< float > max_samples
void zero(int total_samples)
Zero out # of values in both datasets.
void scale(int total_samples, float factor)
Scale # of values by some factor.
float duration
Length of time (in seconds)
Definition ReaderBase.h:43
int channels
The number of audio channels used in the audio stream.
Definition ReaderBase.h:61
openshot::Fraction fps
Frames per second, as a fraction (i.e. 24/1 = 24 fps)
Definition ReaderBase.h:48
int64_t video_length
The number of frames in the video stream.
Definition ReaderBase.h:53
bool has_video
Determines if this file has a video stream.
Definition ReaderBase.h:40
bool has_audio
Determines if this file has an audio stream.
Definition ReaderBase.h:41
int sample_rate
The number of audio samples per second (44100 is a common sample rate)
Definition ReaderBase.h:60