feat(ee): Add transcription support for audio messages (#11670)

<img width="419" alt="Screenshot 2025-06-03 at 4 25 37 PM" src="https://github.com/user-attachments/assets/4b6ddd11-9b91-4981-a571-83746cc4d40b" /> Fixes https://github.com/chatwoot/chatwoot/issues/10182 --------- Co-authored-by: Sojan Jose <sojan@pepalo.com>
2025-11-02 20:18:08 +00:00 · 2025-06-05 18:29:37 -05:00
parent 273c277d47
commit 8bc00f707b
14 changed files with 389 additions and 51 deletions
--- a/app/controllers/api/v1/accounts_controller.rb
+++ b/app/controllers/api/v1/accounts_controller.rb
@@ -92,7 +92,7 @@ class Api::V1::AccountsController < Api::BaseController
  end

  def settings_params
-    params.permit(:auto_resolve_after, :auto_resolve_message, :auto_resolve_ignore_waiting, :auto_resolve_label)
+    params.permit(:auto_resolve_after, :auto_resolve_message, :auto_resolve_ignore_waiting, :audio_transcriptions, :auto_resolve_label)
  end

  def check_signup_enabled
--- a/app/javascript/dashboard/components-next/message/chips/Audio.vue
+++ b/app/javascript/dashboard/components-next/message/chips/Audio.vue
@@ -109,49 +109,58 @@ const downloadAudio = async () => {
  </audio>
  <div
    v-bind="$attrs"
-    class="rounded-xl w-full gap-1 p-1.5 bg-n-alpha-white flex items-center border border-n-container shadow-[0px_2px_8px_0px_rgba(94,94,94,0.06)]"
+    class="rounded-xl w-full gap-2 p-1.5 bg-n-alpha-white flex flex-col items-center border border-n-container shadow-[0px_2px_8px_0px_rgba(94,94,94,0.06)]"
  >
-    <button class="p-0 border-0 size-8" @click="playOrPause">
-      <Icon
-        v-if="isPlaying"
-        class="size-8"
-        icon="i-teenyicons-pause-small-solid"
-      />
-      <Icon v-else class="size-8" icon="i-teenyicons-play-small-solid" />
-    </button>
-    <div class="tabular-nums text-xs">
-      {{ formatTime(currentTime) }} / {{ formatTime(duration) }}
+    <div class="flex gap-1 w-full flex-1 items-center justify-start">
+      <button class="p-0 border-0 size-8" @click="playOrPause">
+        <Icon
+          v-if="isPlaying"
+          class="size-8"
+          icon="i-teenyicons-pause-small-solid"
+        />
+        <Icon v-else class="size-8" icon="i-teenyicons-play-small-solid" />
+      </button>
+      <div class="tabular-nums text-xs">
+        {{ formatTime(currentTime) }} / {{ formatTime(duration) }}
+      </div>
+      <div class="flex-1 items-center flex px-2">
+        <input
+          type="range"
+          min="0"
+          :max="duration"
+          :value="currentTime"
+          class="w-full h-1 bg-n-slate-12/40 rounded-lg appearance-none cursor-pointer accent-current"
+          @input="seek"
+        />
+      </div>
+      <button
+        class="border-0 w-10 h-6 grid place-content-center bg-n-alpha-2 hover:bg-n-alpha-3 rounded-2xl"
+        @click="changePlaybackSpeed"
+      >
+        <span class="text-xs text-n-slate-11 font-medium">
+          {{ playbackSpeedLabel }}
+        </span>
+      </button>
+      <button
+        class="p-0 border-0 size-8 grid place-content-center"
+        @click="toggleMute"
+      >
+        <Icon v-if="isMuted" class="size-4" icon="i-lucide-volume-off" />
+        <Icon v-else class="size-4" icon="i-lucide-volume-2" />
+      </button>
+      <button
+        class="p-0 border-0 size-8 grid place-content-center"
+        @click="downloadAudio"
+      >
+        <Icon class="size-4" icon="i-lucide-download" />
+      </button>
    </div>
-    <div class="flex-1 items-center flex px-2">
-      <input
-        type="range"
-        min="0"
-        :max="duration"
-        :value="currentTime"
-        class="w-full h-1 bg-n-slate-12/40 rounded-lg appearance-none cursor-pointer accent-current"
-        @input="seek"
-      />
+
+    <div
+      v-if="attachment.transcribedText"
+      class="text-n-slate-12 p-3 text-sm bg-n-alpha-1 rounded-lg w-full break-words"
+    >
+      {{ attachment.transcribedText }}
    </div>
-    <button
-      class="border-0 w-10 h-6 grid place-content-center bg-n-alpha-2 hover:bg-alpha-3 rounded-2xl"
-      @click="changePlaybackSpeed"
-    >
-      <span class="text-xs text-n-slate-11 font-medium">
-        {{ playbackSpeedLabel }}
-      </span>
-    </button>
-    <button
-      class="p-0 border-0 size-8 grid place-content-center"
-      @click="toggleMute"
-    >
-      <Icon v-if="isMuted" class="size-4" icon="i-lucide-volume-off" />
-      <Icon v-else class="size-4" icon="i-lucide-volume-2" />
-    </button>
-    <button
-      class="p-0 border-0 size-8 grid place-content-center"
-      @click="downloadAudio"
-    >
-      <Icon class="size-4" icon="i-lucide-download" />
-    </button>
  </div>
 </template>
--- a/app/javascript/dashboard/i18n/locale/en/generalSettings.json
+++ b/app/javascript/dashboard/i18n/locale/en/generalSettings.json
@@ -92,6 +92,32 @@
        "PLACEHOLDER": "Your company's support email",
        "ERROR": ""
      },
+      "AUTO_RESOLVE_IGNORE_WAITING": {
+        "LABEL": "Exclude unattended conversations",
+        "HELP": "When enabled, the system will skip resolving conversations that are still waiting for an agent's reply."
+      },
+      "AUDIO_TRANSCRIPTION": {
+        "TITLE": "Transcribe Audio Messages",
+        "NOTE": "Automatically transcribe audio messages in conversations. Generate a text transcript whenever an audio message is sent or received, and display it alongside the message.",
+        "API": {
+          "SUCCESS": "Audio transcription setting updated successfully",
+          "ERROR": "Failed to update audio transcription setting"
+        }
+      },
+      "AUTO_RESOLVE_DURATION": {
+        "LABEL": "Inactivity duration for resolution",
+        "HELP": "Duration after a conversation should auto resolve if there is no activity",
+        "PLACEHOLDER": "30",
+        "ERROR": "Auto resolve duration should be between 10 minutes and 999 days",
+        "API": {
+          "SUCCESS": "Auto resolve settings updated successfully",
+          "ERROR": "Failed to update auto resolve settings"
+        },
+        "UPDATE_BUTTON": "Update",
+        "MESSAGE_LABEL": "Custom resolution message",
+        "MESSAGE_PLACEHOLDER": "Conversation was marked resolved by system due to 15 days of inactivity",
+        "MESSAGE_HELP": "This message is sent to the customer when a conversation is automatically resolved by the system due to inactivity."
+      },
      "FEATURES": {
        "INBOUND_EMAIL_ENABLED": "Conversation continuity with emails is enabled for your account.",
        "CUSTOM_EMAIL_DOMAIN_ENABLED": "You can receive emails in your custom domain now."
--- a/app/javascript/dashboard/routes/dashboard/settings/account/Index.vue
+++ b/app/javascript/dashboard/routes/dashboard/settings/account/Index.vue
@@ -16,6 +16,7 @@ import AccountId from './components/AccountId.vue';
 import BuildInfo from './components/BuildInfo.vue';
 import AccountDelete from './components/AccountDelete.vue';
 import AutoResolve from './components/AutoResolve.vue';
+import AudioTranscription from './components/AudioTranscription.vue';
 import SectionLayout from './components/SectionLayout.vue';

 export default {
@@ -26,6 +27,7 @@ export default {
    BuildInfo,
    AccountDelete,
    AutoResolve,
+    AudioTranscription,
    SectionLayout,
    WithLabel,
    NextInput,
@@ -235,6 +237,7 @@ export default {
      <woot-loading-state v-if="uiFlags.isFetchingItem" />
    </div>
    <AutoResolve v-if="showAutoResolutionConfig" />
+    <AudioTranscription v-if="isOnChatwootCloud" />
    <AccountId />
    <div v-if="!uiFlags.isFetchingItem && isOnChatwootCloud">
      <AccountDelete />
--- a/app/javascript/dashboard/routes/dashboard/settings/account/components/AudioTranscription.vue
+++ b/app/javascript/dashboard/routes/dashboard/settings/account/components/AudioTranscription.vue
@@ -0,0 +1,51 @@
+<script setup>
+import { ref, watch } from 'vue';
+import { useI18n } from 'vue-i18n';
+import { useAccount } from 'dashboard/composables/useAccount';
+import { useAlert } from 'dashboard/composables';
+import SectionLayout from './SectionLayout.vue';
+import Switch from 'next/switch/Switch.vue';
+
+const { t } = useI18n();
+const isEnabled = ref(false);
+
+const { currentAccount, updateAccount } = useAccount();
+
+// Copies the persisted account setting into the local switch state.
+const syncFromAccount = () => {
+  const { audio_transcriptions } = currentAccount.value?.settings || {};
+  isEnabled.value = !!audio_transcriptions;
+};
+
+watch(currentAccount, syncFromAccount, { deep: true, immediate: true });
+
+// Persists the given settings and reports the outcome with an alert.
+const updateAccountSettings = async settings => {
+  try {
+    await updateAccount(settings);
+    useAlert(t('GENERAL_SETTINGS.FORM.AUDIO_TRANSCRIPTION.API.SUCCESS'));
+  } catch (error) {
+    // The save failed, so snap the switch back to the stored value instead
+    // of leaving it in a state that was never persisted.
+    syncFromAccount();
+    useAlert(t('GENERAL_SETTINGS.FORM.AUDIO_TRANSCRIPTION.API.ERROR'));
+  }
+};
+
+const toggleAudioTranscription = () =>
+  updateAccountSettings({ audio_transcriptions: isEnabled.value });
+</script>
+
+<template>
+  <SectionLayout
+    :title="t('GENERAL_SETTINGS.FORM.AUDIO_TRANSCRIPTION.TITLE')"
+    :description="t('GENERAL_SETTINGS.FORM.AUDIO_TRANSCRIPTION.NOTE')"
+    with-border
+  >
+    <template #headerActions>
+      <div class="flex justify-end">
+        <Switch v-model="isEnabled" @change="toggleAudioTranscription" />
+      </div>
+    </template>
+  </SectionLayout>
+</template>
--- a/app/models/account.rb
+++ b/app/models/account.rb
@@ -36,6 +36,7 @@ class Account < ApplicationRecord
        'auto_resolve_after': { 'type': %w[integer null], 'minimum': 10, 'maximum': 1_439_856 },
        'auto_resolve_message': { 'type': %w[string null] },
        'auto_resolve_ignore_waiting': { 'type': %w[boolean null] },
+        'audio_transcriptions': { 'type': %w[boolean null] },
        'auto_resolve_label': { 'type': %w[string null] }
      },
    'required': [],
@@ -52,7 +53,8 @@ class Account < ApplicationRecord
                 schema: SETTINGS_PARAMS_SCHEMA,
                 attribute_resolver: ->(record) { record.settings }

-  store_accessor :settings, :auto_resolve_after, :auto_resolve_message, :auto_resolve_ignore_waiting, :auto_resolve_label
+  store_accessor :settings, :auto_resolve_after, :auto_resolve_message, :auto_resolve_ignore_waiting
+  store_accessor :settings, :audio_transcriptions, :auto_resolve_label

  has_many :account_users, dependent: :destroy_async
  has_many :agent_bot_inboxes, dependent: :destroy_async
--- a/app/models/attachment.rb
+++ b/app/models/attachment.rb
@@ -44,11 +44,8 @@ class Attachment < ApplicationRecord

  def push_event_data
    return unless file_type
-    return base_data.merge(location_metadata) if file_type.to_sym == :location
-    return base_data.merge(fallback_data) if file_type.to_sym == :fallback
-    return base_data.merge(contact_metadata) if file_type.to_sym == :contact

-    base_data.merge(file_metadata)
+    base_data.merge(metadata_for_file_type)
  end

  # NOTE: the URl returned does a 301 redirect to the actual file
@@ -76,6 +73,30 @@ class Attachment < ApplicationRecord

  private

+  def metadata_for_file_type
+    case file_type.to_sym
+    when :location
+      location_metadata
+    when :fallback
+      fallback_data
+    when :contact
+      contact_metadata
+    when :audio
+      audio_metadata
+    else
+      file_metadata
+    end
+  end
+
+  # Metadata for audio attachments: the regular file metadata plus the stored
+  # transcript under :transcribed_text ('' when none has been saved yet).
+  # base_data is deliberately not merged here: push_event_data already merges
+  # the hash returned by metadata_for_file_type into base_data.
+  def audio_metadata
+    transcript = meta&.[]('transcribed_text') || ''
+    file_metadata.merge(transcribed_text: transcript)
+  end
+
  def file_metadata
    metadata = {
      extension: extension,
@@ -149,3 +170,5 @@ class Attachment < ApplicationRecord
    file_content_type.start_with?('image/', 'video/', 'audio/')
  end
 end
+
+Attachment.include_mod_with('Concerns::Attachment')
--- a/app/models/message.rb
+++ b/app/models/message.rb
@@ -224,6 +224,11 @@ class Message < ApplicationRecord
    save!
  end

+  def send_update_event
+    Rails.configuration.dispatcher.dispatch(MESSAGE_UPDATED, Time.zone.now, message: self, performed_by: Current.executed_by,
+                                                                            previous_changes: previous_changes)
+  end
+
  private

  def prevent_message_flooding
@@ -313,8 +318,7 @@ class Message < ApplicationRecord
    # we want to skip the update event if the message is not updated
    return if previous_changes.blank?

-    Rails.configuration.dispatcher.dispatch(MESSAGE_UPDATED, Time.zone.now, message: self, performed_by: Current.executed_by,
-                                                                            previous_changes: previous_changes)
+    send_update_event
  end

  def send_reply
--- a/enterprise/app/jobs/captain/conversation/response_builder_job.rb
+++ b/enterprise/app/jobs/captain/conversation/response_builder_job.rb
@@ -49,10 +49,24 @@ class Captain::Conversation::ResponseBuilderJob < ApplicationJob

  def message_content(message)
    return message.content if message.content.present?
+    return 'User has shared a message without content' unless message.attachments.any?

-    return 'User has shared an attachment' if message.attachments.any?
+    audio_transcriptions = extract_audio_transcriptions(message.attachments)
+    return audio_transcriptions if audio_transcriptions.present?

-    'User has shared a message without content'
+    'User has shared an attachment'
+  end
+
+  # Returns the transcripts of the audio attachments joined by newlines ('' when
+  # there are none). Failed or empty results are skipped, so a nil transcript
+  # cannot raise and consecutive transcripts are no longer glued together.
+  def extract_audio_transcriptions(attachments)
+    transcripts = attachments.where(file_type: :audio).map do |attachment|
+      result = Messages::AudioTranscriptionService.new(attachment).perform
+      result[:transcriptions] if result[:success]
+    end
+
+    transcripts.select(&:present?).join("\n")
   end

  def determine_role(message)
--- a/enterprise/app/jobs/messages/audio_transcription_job.rb
+++ b/enterprise/app/jobs/messages/audio_transcription_job.rb
@@ -0,0 +1,13 @@
+# Low-priority background job that runs the transcription service for one attachment id.
+class Messages::AudioTranscriptionJob < ApplicationJob
+  queue_as :low
+
+  # Does nothing when the attachment no longer exists; errors are logged and tracked, not re-raised.
+  def perform(attachment_id)
+    record = Attachment.find_by(id: attachment_id)
+    Messages::AudioTranscriptionService.new(record).perform if record.present?
+  rescue StandardError => e
+    Rails.logger.error "Error in AudioTranscriptionJob: #{e.message}"
+    ChatwootExceptionTracker.new(e).capture_exception
+  end
+end
--- a/enterprise/app/models/enterprise/concerns/attachment.rb
+++ b/enterprise/app/models/enterprise/concerns/attachment.rb
@@ -0,0 +1,15 @@
+# Queues Messages::AudioTranscriptionJob after an audio attachment is created.
+module Enterprise::Concerns::Attachment
+  extend ActiveSupport::Concern
+
+  included do
+    after_create_commit :enqueue_audio_transcription
+  end
+
+  private
+
+  # Safe navigation: a nil file_type is skipped here instead of raising NoMethodError.
+  def enqueue_audio_transcription
+    Messages::AudioTranscriptionJob.perform_later(id) if file_type&.to_sym == :audio
+  end
+end
--- a/enterprise/app/services/messages/audio_transcription_service.rb
+++ b/enterprise/app/services/messages/audio_transcription_service.rb
@@ -0,0 +1,67 @@
+# Transcribes an audio attachment with the whisper-1 model and caches the text in its meta.
+# perform returns { success: true, transcriptions: text } or { error: reason }.
+class Messages::AudioTranscriptionService < Llm::BaseOpenAiService
+  attr_reader :attachment, :message, :account
+
+  def initialize(attachment)
+    super()
+    @attachment = attachment
+    @message = attachment.message
+    # `&.` lets perform report a missing message instead of raising here.
+    @account = message&.account
+  end
+
+  def perform
+    # The message check runs first because can_transcribe? needs its account.
+    return { error: 'Message not found' } if message.blank?
+    return { error: 'Transcription limit exceeded' } unless can_transcribe?
+
+    begin
+      transcriptions = transcribe_audio
+      # Log only the attachment id; the transcript is conversation content.
+      Rails.logger.info "Audio transcription successful for attachment #{attachment.id}"
+      { success: true, transcriptions: transcriptions }
+    rescue StandardError => e
+      ChatwootExceptionTracker.new(e).capture_exception
+      Rails.logger.error "Audio transcription failed: #{e.message}"
+      { error: "Transcription failed: #{e.message}" }
+    end
+  end
+
+  private
+
+  def can_transcribe?
+    return false if account.audio_transcriptions.blank?
+
+    account.usage_limits.dig(:captain, :responses, :current_available).to_i.positive?
+  end
+
+  # Yields the audio as a File opened in binary mode. Blob#open downloads to a
+  # uniquely named tempfile and removes it afterwards, even when the block raises.
+  def with_audio_file(&block)
+    attachment.file.blob.open { |tempfile| File.open(tempfile.path, 'rb', &block) }
+  end
+
+  def transcribe_audio
+    cached_text = attachment.meta&.[]('transcribed_text')
+    return cached_text if cached_text.present?
+
+    response = with_audio_file do |audio|
+      @client.audio.transcribe(parameters: { model: 'whisper-1', file: audio, temperature: 0.4 })
+    end
+
+    text = response['text'].to_s
+    update_transcription(text)
+    text
+  end
+
+  # Saves the transcript, dispatches the message update event and increments response usage.
+  def update_transcription(transcribed_text)
+    return if transcribed_text.blank?
+
+    # Merge so keys already present in meta are kept.
+    attachment.update!(meta: (attachment.meta || {}).merge('transcribed_text' => transcribed_text))
+    message.reload.send_update_event
+    message.account.increment_response_usage
+  end
+end
--- a/spec/enterprise/jobs/messages/audio_transcription_job_spec.rb
+++ b/spec/enterprise/jobs/messages/audio_transcription_job_spec.rb
@@ -0,0 +1,41 @@
+require 'rails_helper'
+
+RSpec.describe Messages::AudioTranscriptionJob do
+  subject(:job) { described_class.perform_later(attachment_id) }
+
+  let(:message) { create(:message) }
+  let(:attachment) do
+    message.attachments.create!(
+      account_id: message.account_id,
+      file_type: :audio,
+      file: fixture_file_upload('public/audio/widget/ding.mp3')
+    )
+  end
+  let(:attachment_id) { attachment.id }
+  let(:conversation) { message.conversation }
+  let(:transcription_service) { instance_double(Messages::AudioTranscriptionService) }
+
+  it 'enqueues the job' do
+    expect { job }.to have_enqueued_job(described_class)
+      .with(attachment_id)
+      .on_queue('low')
+  end
+
+  context 'when performing the job' do
+    before do
+      allow(Messages::AudioTranscriptionService).to receive(:new).with(attachment).and_return(transcription_service)
+      allow(transcription_service).to receive(:perform)
+    end
+
+    it 'calls AudioTranscriptionService with the attachment' do
+      expect(Messages::AudioTranscriptionService).to receive(:new).with(attachment)
+      expect(transcription_service).to receive(:perform)
+      described_class.perform_now(attachment_id)
+    end
+
+    it 'does nothing when attachment is not found' do
+      expect(Messages::AudioTranscriptionService).not_to receive(:new)
+      described_class.perform_now(999_999)
+    end
+  end
+end
--- a/spec/enterprise/services/messages/audio_transcription_service_spec.rb
+++ b/spec/enterprise/services/messages/audio_transcription_service_spec.rb
@@ -0,0 +1,70 @@
+require 'rails_helper'
+
+RSpec.describe Messages::AudioTranscriptionService, type: :service do
+  let(:account) { create(:account, audio_transcriptions: true) }
+  let(:conversation) { create(:conversation, account: account) }
+  let(:message) { create(:message, conversation: conversation) }
+  let(:attachment) { message.attachments.create!(account: account, file_type: :audio) }
+
+  before do
+    # Create required installation configs
+    create(:installation_config, name: 'CAPTAIN_OPEN_AI_API_KEY', value: 'test-api-key')
+    create(:installation_config, name: 'CAPTAIN_OPEN_AI_MODEL', value: 'gpt-4o-mini')
+
+    # Mock usage limits for transcription to be available
+    allow(account).to receive(:usage_limits).and_return({ captain: { responses: { current_available: 100 } } })
+  end
+
+  describe '#perform' do
+    let(:service) { described_class.new(attachment) }
+
+    context 'when transcription is successful' do
+      before do
+        # Mock can_transcribe? to return true and transcribe_audio method
+        allow(service).to receive(:can_transcribe?).and_return(true)
+        allow(service).to receive(:transcribe_audio).and_return('Hello world transcription')
+      end
+
+      it 'returns successful transcription' do
+        result = service.perform
+        expect(result).to eq({ success: true, transcriptions: 'Hello world transcription' })
+      end
+    end
+
+    context 'when audio transcriptions are disabled' do
+      before do
+        account.update!(audio_transcriptions: false)
+      end
+
+      it 'returns error for transcription limit exceeded' do
+        result = service.perform
+        expect(result).to eq({ error: 'Transcription limit exceeded' })
+      end
+    end
+
+    context 'when attachment already has transcribed text' do
+      before do
+        attachment.update!(meta: { transcribed_text: 'Existing transcription' })
+        allow(service).to receive(:can_transcribe?).and_return(true)
+      end
+
+      it 'returns existing transcription without calling API' do
+        result = service.perform
+        expect(result).to eq({ success: true, transcriptions: 'Existing transcription' })
+      end
+    end
+
+    context 'when transcription fails' do
+      before do
+        allow(service).to receive(:can_transcribe?).and_return(true)
+        allow(service).to receive(:transcribe_audio).and_raise(StandardError.new('API error'))
+        allow(ChatwootExceptionTracker).to receive(:new).and_return(instance_double(ChatwootExceptionTracker, capture_exception: nil))
+      end
+
+      it 'returns error response' do
+        result = service.perform
+        expect(result).to eq({ error: 'Transcription failed: API error' })
+      end
+    end
+  end
+end