RTP: fix sending of RFC-compliant OPUS stream

The RTP timestamp was incorrectly computed from data_len/bps, which confused VLC. Also includes some cleanup.
2026-03-20 10:40:09 +00:00 · 2018-08-20 18:21:19 +02:00
parent 86874cb8a9
commit 315749bbe5
8 changed files with 79 additions and 79 deletions
--- a/src/audio/codec/libavcodec.cpp
+++ b/src/audio/codec/libavcodec.cpp
@@ -435,6 +435,7 @@ static audio_channel *libavcodec_compress(void *state, audio_channel * channel)
 				s->output_channel.data_len += pkt.size;
 				av_packet_unref(&pkt);
 				ret = avcodec_receive_packet(s->codec_ctx, &pkt);
+                                s->output_channel.duration += s->codec_ctx->frame_size / (double) s->output_channel.sample_rate;
 			}
 			if (ret != AVERROR(EAGAIN) && ret != 0) {
 				char errbuf[1024];
@@ -460,10 +461,7 @@ static audio_channel *libavcodec_compress(void *state, audio_channel * channel)
                }
                if(got_packet) {
                        s->output_channel.data_len += pkt.size;
-                        ///@ todo
-                        /// well, this is wrong, denominator should be actually AVStream::time_base. Where do
-                        /// we get this?? Anyway, seems like it equals sample rate.
-                        s->output_channel.duration += pkt.duration / (double) s->output_channel.sample_rate;
+                        s->output_channel.duration += s->codec_ctx->frame_size / (double) s->output_channel.sample_rate;
                }
 #endif
                offset += chunk_size;
@@ -671,3 +669,4 @@ static const struct audio_compress_info libavcodec_audio_codec = {

 REGISTER_MODULE(libavcodec,  &libavcodec_audio_codec, LIBRARY_CLASS_AUDIO_COMPRESS, AUDIO_COMPRESS_ABI_VERSION);

+/* vim: set expandtab sw=8 : */
--- a/src/audio/types.h
+++ b/src/audio/types.h
@@ -185,7 +185,7 @@ private:
        int sample_rate;
        std::vector<channel> channels; /* data should be at least 4B aligned */
        audio_codec_t codec;
-        double duration; /// @note currently unused
+        double duration; ///< for compressed formats where this cannot be directly determined from samples/sample_rate
 };
 #endif // __cplusplus

--- a/src/audio/utils.cpp
+++ b/src/audio/utils.cpp
@@ -207,7 +207,7 @@ void demux_channel(char *out, char *in, int bps, int in_len, int in_stream_chann
        }
 }

-void remux_channel(char *out, char *in, int bps, int in_len, int in_stream_channels, int out_stream_channels, int pos_in_stream, int pos_out_stream)
+void remux_channel(char *out, const char *in, int bps, int in_len, int in_stream_channels, int out_stream_channels, int pos_in_stream, int pos_out_stream)
 {
        int samples = in_len / (in_stream_channels * bps);
        int i;
--- a/src/audio/utils.h
+++ b/src/audio/utils.h
@@ -89,7 +89,7 @@ void copy_channel(char *out, const char *in, int bps, int in_len /* bytes */, in
 */
 void mux_channel(char *out, const char *in, int bps, int in_len, int out_stream_channels, int chan_pos_stream, double scale);
 void demux_channel(char *out, char *in, int bps, int in_len, int in_stream_channels, int pos_in_stream);
-void remux_channel(char *out, char *in, int bps, int in_len, int in_stream_channels, int out_stream_channels, int pos_in_stream, int pos_out_stream);
+void remux_channel(char *out, const char *in, int bps, int in_len, int in_stream_channels, int out_stream_channels, int pos_in_stream, int pos_out_stream);

 void interleaved2noninterleaved(char *out, const char *in, int bps, int in_len /* bytes */, int channel_count);

--- a/src/transmit.cpp
+++ b/src/transmit.cpp
@@ -62,13 +62,14 @@
 #include "config_win32.h"
 #endif // HAVE_CONFIG_H

+#include "audio/audio.h"
+#include "audio/codec.h"
+#include "audio/utils.h"
 #include "crypto/random.h"
 #include "debug.h"
 #include "host.h"
 #include "lib_common.h"
 #include "perf.h"
-#include "audio/audio.h"
-#include "audio/codec.h"
 #include "crypto/openssl_encrypt.h"
 #include "module.h"
 #include "rtp/fec.h"
@@ -102,11 +103,6 @@

 #define DEFAULT_CIPHER_MODE MODE_AES128_CFB

-// Mulaw audio memory reservation
-#define BUFFER_MTU_SIZE 1500
-static char *data_buffer_mulaw;
-static int buffer_mulaw_init = 0;
-
 static void tx_update(struct tx *tx, struct video_frame *frame, int substream);
 static void tx_done(struct module *tx);
 static uint32_t format_interl_fps_hdr_row(enum interlacing_t interlacing, double input_fps);
@@ -147,16 +143,9 @@ struct tx {
        long long int bitrate;
 		
        struct rtpenc_h264_state *rtpenc_h264_state;
+        char tmp_packet[RTP_MAX_MTU];
 };

-// Mulaw audio memory reservation
-static void init_tx_mulaw_buffer() {
-    if (!buffer_mulaw_init) {
-        data_buffer_mulaw = (char *) malloc(BUFFER_MTU_SIZE*20);
-        buffer_mulaw_init = 1;
-    }
-}
-
 static void tx_update(struct tx *tx, struct video_frame *frame, int substream)
 {
        if(!frame) {
@@ -876,7 +865,7 @@ void audio_tx_send(struct tx* tx, struct rtp *rtp_session, const audio_frame2 *
        tx->buffer ++;
 }

-/*
+/**
 * audio_tx_send_standard - Send interleaved channels from the audio_frame2,
 *                       	as the mulaw and A-law standards (dynamic or std PT).
 */
@@ -908,47 +897,50 @@ void audio_tx_send_standard(struct tx* tx, struct rtp *rtp_session,
 		assert(buffer->get_data_len(0) == buffer->get_data_len(i));

 	int data_len = buffer->get_data_len(0) * buffer->get_channel_count(); 	/* Number of samples to send 			*/
-	int payload_size = tx->mtu - 40; 						/* Max size of an RTP payload field 	*/
+	int payload_size = tx->mtu - 40 - 8 - 12; /* Max size of an RTP payload field (minus IPv6, UDP and RTP header lengths) */

-	init_tx_mulaw_buffer();
-	char *curr_sample = data_buffer_mulaw;
-	int ch, pos = 0, count = 0, pointerToSend = 0;
+        if (buffer->get_codec() == AC_OPUS) { // OPUS needs to fit one package
+                if (payload_size < data_len) {
+                        log_msg(LOG_LEVEL_ERROR, "Transmit: OPUS frame larger than packet! Discarding...\n");
+                        return;
+                }
+        } else { // we may split the data into more packets, compute chunk size
+                int frame_size = buffer->get_channel_count() * buffer->get_bps();
+                payload_size = payload_size / frame_size * frame_size; // align to frame size
+        }

+	int pos = 0;
 	do {
-		for (ch = 0; ch < buffer->get_channel_count(); ch++) {
-			memcpy(curr_sample, buffer->get_data(ch) + pos,
-					buffer->get_bps() * sizeof(char));
-			curr_sample += buffer->get_bps() * sizeof(char);
-			count += buffer->get_bps() * sizeof(char);
-		}
-		pos += buffer->get_bps() * sizeof(char);
+                int pkt_len = std::min(payload_size, data_len - pos);

-		if ((pos * buffer->get_channel_count()) % payload_size == 0) {
-			// Update first sample timestamp
-			ts =	get_std_audio_local_mediatime((double)payload_size / (double)buffer->get_channel_count());
-			gettimeofday(&curr_time, NULL);
-			rtp_send_ctrl(rtp_session, ts_prev, 0, curr_time); //send RTCP SR
-			ts_prev = ts;
-			// Send the packet
-			rtp_send_data(rtp_session, ts, pt, 0, 0, /* contributing sources 		*/
-			0, 												/* contributing sources length 	*/
-			data_buffer_mulaw + pointerToSend, payload_size, 0, 0, 0);
-			pointerToSend += payload_size;
-		}
-	} while (count < data_len);
+                // interleave
+                if (buffer->get_codec() == AC_OPUS) {
+                        assert(buffer->get_channel_count() == 1); // we cannot interleave OPUS here
+                        memcpy(tx->tmp_packet, buffer->get_data(0), pkt_len);
+                } else {
+                        for (int ch = 0; ch < buffer->get_channel_count(); ch++) {
+                                remux_channel(tx->tmp_packet, buffer->get_data(ch) + pos / buffer->get_channel_count(), buffer->get_bps(), pkt_len / buffer->get_channel_count(), 1, buffer->get_channel_count(), 0, ch);
+                        }
+                }

-	if ((pos * buffer->get_channel_count()) % payload_size != 0) {
-		// Update first sample timestamp
-		ts =	get_std_audio_local_mediatime((double)((pos * buffer->get_channel_count()) % payload_size) / (double)buffer->get_channel_count());
-		gettimeofday(&curr_time, NULL);
-		rtp_send_ctrl(rtp_session, ts_prev, 0, curr_time); //send RTCP SR
-		ts_prev = ts;
-		// Send the packet
-		rtp_send_data(rtp_session, ts, pt, 0, 0, 	/* contributing sources 		*/
-		0, 													/* contributing sources length 	*/
-		data_buffer_mulaw + pointerToSend,
-				(pos * buffer->get_channel_count()) % payload_size, 0, 0, 0);
-	}
+                // Update first sample timestamp
+                if (buffer->get_codec() == AC_OPUS) {
+                        /* OPUS packet will be the whole contained in one packet
+                         * according to RFC 7587. For PCMA/PCMU there may be more
+                         * packets so we cannot use the whole frame duration. */
+                        ts = get_std_audio_local_mediatime(buffer->get_duration(), 48000);
+                } else {
+                        ts = get_std_audio_local_mediatime((double) pkt_len / (double) buffer->get_channel_count() / (double) buffer->get_sample_rate(), buffer->get_sample_rate());
+                }
+                gettimeofday(&curr_time, NULL);
+                rtp_send_ctrl(rtp_session, ts_prev, 0, curr_time); //send RTCP SR
+                ts_prev = ts;
+                // Send the packet
+                rtp_send_data(rtp_session, ts, pt, 0, 0, /* contributing sources 		*/
+                                0, 												/* contributing sources length 	*/
+                                tx->tmp_packet, pkt_len, 0, 0, 0);
+                pos += pkt_len;
+	} while (pos < data_len);
 }

 /**
--- a/src/tv.c
+++ b/src/tv.c
@@ -148,7 +148,11 @@ typedef struct { //shared struct for audio and video streams (sync.)

 std_time_struct standard_time = { true, 0, { 0, 0 }, 25, { 0, 0 }, { 0, 0 } };

-uint32_t get_std_audio_local_mediatime(double samples)
+/**
+ * @param samples       duration of the audio data in seconds (not a sample count)
+ * @param rate          RTP timestamp scale (usually sample rate, but for OPUS always 48000)
+ */
+uint32_t get_std_audio_local_mediatime(double samples, int rate)
 {
        if (standard_time.init) {
 			gettimeofday(&standard_time.start_time, NULL);
@@ -164,7 +168,7 @@ uint32_t get_std_audio_local_mediatime(double samples)
            tv_add(&standard_time.atime, samples);
        }

-        return (double)standard_time.atime.tv_sec + (((double)standard_time.atime.tv_usec) / 1000000.0);
+        return ((double)standard_time.atime.tv_sec + (((double)standard_time.atime.tv_usec) / 1000000.0)) * rate;
 }

 uint32_t get_std_video_local_mediatime(void)
--- a/src/tv.h
+++ b/src/tv.h
@@ -54,7 +54,7 @@ uint32_t tv_diff_usec(struct timeval curr_time, struct timeval prev_time);
 void     tv_add(struct timeval *ts, double offset_secs);
 void     tv_add_usec(struct timeval *ts, double offset);
 int      tv_gt(struct timeval a, struct timeval b);
-uint32_t get_std_audio_local_mediatime(double samples);
+uint32_t get_std_audio_local_mediatime(double samples, int rate);
 uint32_t get_std_video_local_mediatime(void);

 #ifdef __cplusplus
--- a/src/video_rxtx/h264_sdp.cpp
+++ b/src/video_rxtx/h264_sdp.cpp
@@ -50,6 +50,7 @@
 #include "lib_common.h"
 #include "transmit.h"
 #include "rtp/rtp.h"
+#include "rtp/rtp_callback.h" // PCMA/PCMU packet types
 #include "rtp/rtpenc_h264.h"
 #include "utils/sdp.h"
 #include "video_rxtx.h"
@@ -65,24 +66,28 @@ h264_sdp_video_rxtx::h264_sdp_video_rxtx(std::map<std::string, param_u> const &p
        m_sdp = new_sdp(std_H264, params.at("tx_port").i);
        if (params.at("a_tx_port").i) {
                new_stream(m_sdp);
-                sprintf(m_sdp->stream[1].media_info, "m=audio %d RTP/AVP 97\n", params.at("a_tx_port").i);
-                const char *audio_codec = NULL;
-                switch (params.at("audio_codec").l) {
-                        case AC_ALAW:
-                                audio_codec = "PCMA";
-                                break;
-                        case AC_MULAW:
-                                audio_codec = "PCMU";
-                                break;
-                        case AC_OPUS:
-                                audio_codec = "OPUS";
-                                break;
+                if (params.at("audio_sample_rate").i == 8000 && params.at("audio_channels").i == 1 && (params.at("audio_codec").l ==  AC_ALAW || params.at("audio_codec").l == AC_MULAW)) {
+                        sprintf(m_sdp->stream[1].media_info, "m=audio %d RTP/AVP %d\n", params.at("a_tx_port").i, params.at("audio_codec").l ==  AC_MULAW ? PT_ITU_T_G711_PCMU : PT_ITU_T_G711_PCMA);
+                } else {
+                        sprintf(m_sdp->stream[1].media_info, "m=audio %d RTP/AVP 97\n", params.at("a_tx_port").i);
+                        const char *audio_codec = NULL;
+                        switch (params.at("audio_codec").l) {
+                                case AC_ALAW:
+                                        audio_codec = "PCMA";
+                                        break;
+                                case AC_MULAW:
+                                        audio_codec = "PCMU";
+                                        break;
+                                case AC_OPUS:
+                                        audio_codec = "OPUS";
+                                        break;
+                        }
+
+                        assert(audio_codec);
+
+                        sprintf(m_sdp->stream[1].rtpmap, "a=rtpmap:97 %s/%i/%i", audio_codec,
+                                        params.at("audio_codec").l == AC_OPUS ? 48000 : params.at("audio_sample_rate").i, params.at("audio_channels").i);
                }
-
-                assert(audio_codec);
-
-                sprintf(m_sdp->stream[1].rtpmap, "a=rtpmap:97 %s/%i/%i", audio_codec,
-                                params.at("audio_sample_rate").i, params.at("audio_channels").i);
        }
        if (m_sdp == NULL) {
                throw string("[SDP] SDP creation failed\n");