feat: Add the support for images in Captain (#11850)

This commit is contained in:
Tanmay Deep Sharma
2025-07-11 23:28:46 +07:00
committed by GitHub
parent 802f0694ed
commit 5b9f997fa0
8 changed files with 560 additions and 45 deletions

View File

@@ -25,8 +25,8 @@ class Api::V1::Accounts::Captain::AssistantsController < Api::V1::Accounts::Base
def playground
response = Captain::Llm::AssistantChatService.new(assistant: @assistant).generate_response(
params[:message_content],
message_history
additional_message: params[:message_content],
message_history: message_history
)
render json: response

View File

@@ -1,6 +1,7 @@
class Captain::Conversation::ResponseBuilderJob < ApplicationJob
MAX_MESSAGE_LENGTH = 10_000
retry_on ActiveStorage::FileNotFoundError, attempts: 3
retry_on ActiveStorage::FileNotFoundError, attempts: 3, wait: 2.seconds
retry_on Faraday::BadRequestError, attempts: 3, wait: 2.seconds
def perform(conversation, assistant)
@conversation = conversation
@@ -13,7 +14,7 @@ class Captain::Conversation::ResponseBuilderJob < ApplicationJob
generate_and_process_response
end
rescue StandardError => e
raise e if e.is_a?(ActiveStorage::FileNotFoundError)
raise e if e.is_a?(ActiveStorage::FileNotFoundError) || e.is_a?(Faraday::BadRequestError)
handle_error(e)
ensure
@@ -26,8 +27,7 @@ class Captain::Conversation::ResponseBuilderJob < ApplicationJob
def generate_and_process_response
@response = Captain::Llm::AssistantChatService.new(assistant: @assistant).generate_response(
@conversation.messages.incoming.last.content,
collect_previous_messages
message_history: collect_previous_messages
)
return process_action('handoff') if handoff_requested?
@@ -43,39 +43,19 @@ class Captain::Conversation::ResponseBuilderJob < ApplicationJob
.where(message_type: [:incoming, :outgoing])
.where(private: false)
.map do |message|
{
content: message_content(message),
role: determine_role(message)
}
end
end
def message_content(message)
return message.content if message.content.present?
return 'User has shared a message without content' unless message.attachments.any?
audio_transcriptions = extract_audio_transcriptions(message.attachments)
return audio_transcriptions if audio_transcriptions.present?
'User has shared an attachment'
end
def extract_audio_transcriptions(attachments)
audio_attachments = attachments.where(file_type: :audio)
return '' if audio_attachments.blank?
transcriptions = ''
audio_attachments.each do |attachment|
result = Messages::AudioTranscriptionService.new(attachment).perform
transcriptions += result[:transcriptions] if result[:success]
{
content: prepare_multimodal_message_content(message),
role: determine_role(message)
}
end
transcriptions
end
def determine_role(message)
return 'system' if message.content.blank?
message.message_type == 'incoming' ? 'user' : 'assistant'
end
message.message_type == 'incoming' ? 'user' : 'system'
def prepare_multimodal_message_content(message)
Captain::OpenAiMessageBuilderService.new(message: message).generate_content
end
def handoff_requested?

View File

@@ -12,9 +12,16 @@ class Captain::Llm::AssistantChatService < Llm::BaseOpenAiService
register_tools
end
def generate_response(input, previous_messages = [], role = 'user')
@messages += previous_messages
@messages << { role: role, content: input } if input.present?
# additional_message: A single message (String) from the user that should be appended to the chat.
# It can be an empty String or nil when you only want to supply historical messages.
# message_history: An Array of already formatted messages that provide the previous context.
# role: The role for the additional_message (defaults to `user`).
#
# NOTE: Parameters are provided as keyword arguments to improve clarity and avoid relying on
# positional ordering.
def generate_response(additional_message: nil, message_history: [], role: 'user')
@messages += message_history
@messages << { role: role, content: additional_message } if additional_message.present?
request_chat_completion
end

View File

@@ -0,0 +1,60 @@
class Captain::OpenAiMessageBuilderService
pattr_initialize [:message!]
def generate_content
parts = []
parts << text_part(@message.content) if @message.content.present?
parts.concat(attachment_parts(@message.attachments)) if @message.attachments.any?
return 'Message without content' if parts.blank?
return parts.first[:text] if parts.one? && parts.first[:type] == 'text'
parts
end
private
def text_part(text)
{ type: 'text', text: text }
end
def image_part(image_url)
{ type: 'image_url', image_url: { url: image_url } }
end
def attachment_parts(attachments)
image_attachments = attachments.where(file_type: :image)
image_content = image_parts(image_attachments)
transcription = extract_audio_transcriptions(attachments)
transcription_part = text_part(transcription) if transcription.present?
attachment_part = text_part('User has shared an attachment') if attachments.where.not(file_type: %i[image audio]).exists?
[image_content, transcription_part, attachment_part].flatten.compact
end
def image_parts(image_attachments)
image_attachments.each_with_object([]) do |attachment, parts|
url = get_attachment_url(attachment)
parts << image_part(url) if url.present?
end
end
def get_attachment_url(attachment)
return attachment.download_url if attachment.download_url.present?
return attachment.external_url if attachment.external_url.present?
attachment.file.attached? ? attachment.file_url : nil
end
def extract_audio_transcriptions(attachments)
audio_attachments = attachments.where(file_type: :audio)
return '' if audio_attachments.blank?
audio_attachments.map do |attachment|
result = Messages::AudioTranscriptionService.new(attachment).perform
result[:success] ? result[:transcriptions] : ''
end.join
end
end

View File

@@ -1,13 +1,34 @@
module Enterprise::MessageTemplates::HookExecutionService
MAX_ATTACHMENT_WAIT_SECONDS = 4
def trigger_templates
super
return unless should_process_captain_response?
return perform_handoff unless inbox.captain_active?
Captain::Conversation::ResponseBuilderJob.perform_later(
conversation,
conversation.inbox.captain_assistant
)
schedule_captain_response
end
private
def schedule_captain_response
job_args = [conversation, conversation.inbox.captain_assistant]
if message.attachments.blank?
Captain::Conversation::ResponseBuilderJob.perform_later(*job_args)
else
wait_time = calculate_attachment_wait_time
Captain::Conversation::ResponseBuilderJob.set(wait: wait_time).perform_later(*job_args)
end
end
def calculate_attachment_wait_time
attachment_count = message.attachments.size
base_wait = 1.second
# Wait longer for more attachments or larger files
additional_wait = [attachment_count * 1, MAX_ATTACHMENT_WAIT_SECONDS].min.seconds
base_wait + additional_wait
end
def should_process_captain_response?

View File

@@ -211,8 +211,8 @@ RSpec.describe 'Api::V1::Accounts::Captain::Assistants', type: :request do
expect(response).to have_http_status(:success)
expect(chat_service).to have_received(:generate_response).with(
valid_params[:message_content],
valid_params[:message_history]
additional_message: valid_params[:message_content],
message_history: valid_params[:message_history]
)
expect(json_response[:content]).to eq('Assistant response')
end
@@ -232,8 +232,8 @@ RSpec.describe 'Api::V1::Accounts::Captain::Assistants', type: :request do
expect(response).to have_http_status(:success)
expect(chat_service).to have_received(:generate_response).with(
params_without_history[:message_content],
[]
additional_message: params_without_history[:message_content],
message_history: []
)
end
end

View File

@@ -30,5 +30,142 @@ RSpec.describe Captain::Conversation::ResponseBuilderJob, type: :job do
account.reload
expect(account.usage_limits[:captain][:responses][:consumed]).to eq(1)
end
context 'when message contains an image' do
let(:message_with_image) { create(:message, conversation: conversation, message_type: :incoming, content: 'Can you help with this error?') }
let(:image_attachment) { message_with_image.attachments.create!(account: account, file_type: :image, external_url: 'https://example.com/error.jpg') }
before do
image_attachment
end
it 'includes image URL directly in the message content for OpenAI vision analysis' do
# Expect the generate_response to receive multimodal content with image URL
expect(mock_llm_chat_service).to receive(:generate_response) do |**kwargs|
history = kwargs[:message_history]
last_entry = history.last
expect(last_entry[:content]).to be_an(Array)
expect(last_entry[:content].any? { |part| part[:type] == 'text' && part[:text] == 'Can you help with this error?' }).to be true
expect(last_entry[:content].any? do |part|
part[:type] == 'image_url' && part[:image_url][:url] == 'https://example.com/error.jpg'
end).to be true
{ 'response' => 'I can see the error in your image. It appears to be a database connection issue.' }
end
described_class.perform_now(conversation, assistant)
end
end
end
describe 'retry mechanisms for image processing' do
let(:conversation) { create(:conversation, inbox: inbox, account: account) }
let(:mock_llm_chat_service) { instance_double(Captain::Llm::AssistantChatService) }
let(:mock_message_builder) { instance_double(Captain::OpenAiMessageBuilderService) }
before do
create(:message, conversation: conversation, content: 'Hello with image', message_type: :incoming)
allow(Captain::Llm::AssistantChatService).to receive(:new).and_return(mock_llm_chat_service)
allow(Captain::OpenAiMessageBuilderService).to receive(:new).with(message: anything).and_return(mock_message_builder)
allow(mock_message_builder).to receive(:generate_content).and_return('Hello with image')
allow(mock_llm_chat_service).to receive(:generate_response).and_return({ 'response' => 'Test response' })
end
context 'when ActiveStorage::FileNotFoundError occurs' do
it 'handles file errors and triggers handoff' do
allow(mock_message_builder).to receive(:generate_content)
.and_raise(ActiveStorage::FileNotFoundError, 'Image file not found')
# For retryable errors, the job should handle them and proceed with handoff
described_class.perform_now(conversation, assistant)
# Verify handoff occurred due to repeated failures
expect(conversation.reload.status).to eq('open')
end
it 'succeeds when no error occurs' do
# Don't raise any error, should succeed normally
allow(mock_message_builder).to receive(:generate_content)
.and_return('Image content processed successfully')
described_class.perform_now(conversation, assistant)
expect(conversation.messages.outgoing.count).to eq(1)
expect(conversation.messages.outgoing.last.content).to eq('Test response')
end
end
context 'when Faraday::BadRequestError occurs' do
it 'handles API errors and triggers handoff' do
allow(mock_llm_chat_service).to receive(:generate_response)
.and_raise(Faraday::BadRequestError, 'Bad request to image service')
described_class.perform_now(conversation, assistant)
expect(conversation.reload.status).to eq('open')
end
it 'succeeds when no error occurs' do
# Don't raise any error, should succeed normally
allow(mock_llm_chat_service).to receive(:generate_response)
.and_return({ 'response' => 'Response after retry' })
described_class.perform_now(conversation, assistant)
expect(conversation.messages.outgoing.last.content).to eq('Response after retry')
end
end
context 'when image processing fails permanently' do
before do
allow(mock_message_builder).to receive(:generate_content)
.and_raise(ActiveStorage::FileNotFoundError, 'Image permanently unavailable')
end
it 'triggers handoff after max retries' do
# Since perform_now re-raises retryable errors, simulate the final failure after retries
allow(mock_message_builder).to receive(:generate_content)
.and_raise(StandardError, 'Max retries exceeded')
expect(ChatwootExceptionTracker).to receive(:new).and_call_original
described_class.perform_now(conversation, assistant)
expect(conversation.reload.status).to eq('open')
end
end
context 'when non-retryable error occurs' do
let(:standard_error) { StandardError.new('Generic error') }
before do
allow(mock_llm_chat_service).to receive(:generate_response).and_raise(standard_error)
end
it 'handles error and triggers handoff' do
expect(ChatwootExceptionTracker).to receive(:new)
.with(standard_error, account: account)
.and_call_original
described_class.perform_now(conversation, assistant)
expect(conversation.reload.status).to eq('open')
end
it 'ensures Current.executed_by is reset' do
expect(Current).to receive(:executed_by=).with(assistant)
expect(Current).to receive(:executed_by=).with(nil)
described_class.perform_now(conversation, assistant)
end
end
end
describe 'job configuration' do
it 'has retry_on configuration for retryable errors' do
expect(described_class).to respond_to(:retry_on)
end
it 'defines MAX_MESSAGE_LENGTH constant' do
expect(described_class::MAX_MESSAGE_LENGTH).to eq(10_000)
end
end
end

View File

@@ -0,0 +1,310 @@
require 'rails_helper'
RSpec.describe Captain::OpenAiMessageBuilderService do
subject(:service) { described_class.new(message: message) }
let(:message) { create(:message, content: 'Hello world') }
describe '#generate_content' do
context 'when message has only text content' do
it 'returns the text content directly' do
expect(service.generate_content).to eq('Hello world')
end
end
context 'when message has no content and no attachments' do
let(:message) { create(:message, content: nil) }
it 'returns default message' do
expect(service.generate_content).to eq('Message without content')
end
end
context 'when message has text content and attachments' do
before do
attachment = message.attachments.build(account_id: message.account_id, file_type: :image, external_url: 'https://example.com/image.jpg')
attachment.save!
end
it 'returns an array of content parts' do
result = service.generate_content
expect(result).to be_an(Array)
expect(result).to include({ type: 'text', text: 'Hello world' })
expect(result).to include({ type: 'image_url', image_url: { url: 'https://example.com/image.jpg' } })
end
end
context 'when message has only non-text attachments' do
let(:message) { create(:message, content: nil) }
before do
attachment = message.attachments.build(account_id: message.account_id, file_type: :image, external_url: 'https://example.com/image.jpg')
attachment.save!
end
it 'returns an array of content parts without text' do
result = service.generate_content
expect(result).to be_an(Array)
expect(result).to include({ type: 'image_url', image_url: { url: 'https://example.com/image.jpg' } })
expect(result).not_to include(hash_including(type: 'text', text: 'Hello world'))
end
end
end
describe '#attachment_parts' do
let(:message) { create(:message, content: nil) }
let(:attachments) { message.attachments }
context 'with image attachments' do
before do
attachment = message.attachments.build(account_id: message.account_id, file_type: :image, external_url: 'https://example.com/image.jpg')
attachment.save!
end
it 'includes image parts' do
result = service.send(:attachment_parts, attachments)
expect(result).to include({ type: 'image_url', image_url: { url: 'https://example.com/image.jpg' } })
end
end
context 'with audio attachments' do
let(:audio_attachment) do
attachment = message.attachments.build(account_id: message.account_id, file_type: :audio)
attachment.save!
attachment
end
before do
allow(Messages::AudioTranscriptionService).to receive(:new).with(audio_attachment).and_return(
instance_double(Messages::AudioTranscriptionService, perform: { success: true, transcriptions: 'Audio transcription text' })
)
end
it 'includes transcription text part' do
audio_attachment # trigger creation
result = service.send(:attachment_parts, attachments)
expect(result).to include({ type: 'text', text: 'Audio transcription text' })
end
end
context 'with other file types' do
before do
attachment = message.attachments.build(account_id: message.account_id, file_type: :file)
attachment.save!
end
it 'includes generic attachment message' do
result = service.send(:attachment_parts, attachments)
expect(result).to include({ type: 'text', text: 'User has shared an attachment' })
end
end
context 'with mixed attachment types' do
let(:image_attachment) do
attachment = message.attachments.build(account_id: message.account_id, file_type: :image, external_url: 'https://example.com/image.jpg')
attachment.save!
attachment
end
let(:audio_attachment) do
attachment = message.attachments.build(account_id: message.account_id, file_type: :audio)
attachment.save!
attachment
end
let(:document_attachment) do
attachment = message.attachments.build(account_id: message.account_id, file_type: :file)
attachment.save!
attachment
end
before do
allow(Messages::AudioTranscriptionService).to receive(:new).with(audio_attachment).and_return(
instance_double(Messages::AudioTranscriptionService, perform: { success: true, transcriptions: 'Audio text' })
)
end
it 'includes all relevant parts' do
image_attachment # trigger creation
audio_attachment # trigger creation
document_attachment # trigger creation
result = service.send(:attachment_parts, attachments)
expect(result).to include({ type: 'image_url', image_url: { url: 'https://example.com/image.jpg' } })
expect(result).to include({ type: 'text', text: 'Audio text' })
expect(result).to include({ type: 'text', text: 'User has shared an attachment' })
end
end
end
describe '#image_parts' do
let(:message) { create(:message, content: nil) }
context 'with valid image attachments' do
let(:image1) do
attachment = message.attachments.build(account_id: message.account_id, file_type: :image, external_url: 'https://example.com/image1.jpg')
attachment.save!
attachment
end
let(:image2) do
attachment = message.attachments.build(account_id: message.account_id, file_type: :image, external_url: 'https://example.com/image2.jpg')
attachment.save!
attachment
end
it 'returns image parts for all valid images' do
image1 # trigger creation
image2 # trigger creation
image_attachments = message.attachments.where(file_type: :image)
result = service.send(:image_parts, image_attachments)
expect(result).to include({ type: 'image_url', image_url: { url: 'https://example.com/image1.jpg' } })
expect(result).to include({ type: 'image_url', image_url: { url: 'https://example.com/image2.jpg' } })
end
end
context 'with image attachments without URLs' do
let(:image_attachment) do
attachment = message.attachments.build(account_id: message.account_id, file_type: :image, external_url: nil)
attachment.save!
attachment
end
before do
allow(image_attachment).to receive(:file).and_return(instance_double(ActiveStorage::Attached::One, attached?: false))
end
it 'skips images without valid URLs' do
image_attachment # trigger creation
image_attachments = message.attachments.where(file_type: :image)
result = service.send(:image_parts, image_attachments)
expect(result).to be_empty
end
end
end
describe '#get_attachment_url' do
let(:attachment) do
attachment = message.attachments.build(account_id: message.account_id, file_type: :image)
attachment.save!
attachment
end
context 'when attachment has external_url' do
before { attachment.update(external_url: 'https://example.com/image.jpg') }
it 'returns external_url' do
expect(service.send(:get_attachment_url, attachment)).to eq('https://example.com/image.jpg')
end
end
context 'when attachment has attached file' do
before do
attachment.update(external_url: nil)
allow(attachment).to receive(:file).and_return(instance_double(ActiveStorage::Attached::One, attached?: true))
allow(attachment).to receive(:file_url).and_return('https://local.com/file.jpg')
allow(attachment).to receive(:download_url).and_return('')
end
it 'returns file_url' do
expect(service.send(:get_attachment_url, attachment)).to eq('https://local.com/file.jpg')
end
end
context 'when attachment has no URL or file' do
before do
attachment.update(external_url: nil)
allow(attachment).to receive(:file).and_return(instance_double(ActiveStorage::Attached::One, attached?: false))
end
it 'returns nil' do
expect(service.send(:get_attachment_url, attachment)).to be_nil
end
end
end
describe '#extract_audio_transcriptions' do
let(:message) { create(:message, content: nil) }
context 'with no audio attachments' do
it 'returns empty string' do
result = service.send(:extract_audio_transcriptions, message.attachments)
expect(result).to eq('')
end
end
context 'with successful audio transcriptions' do
let(:audio1) do
attachment = message.attachments.build(account_id: message.account_id, file_type: :audio)
attachment.save!
attachment
end
let(:audio2) do
attachment = message.attachments.build(account_id: message.account_id, file_type: :audio)
attachment.save!
attachment
end
before do
allow(Messages::AudioTranscriptionService).to receive(:new).with(audio1).and_return(
instance_double(Messages::AudioTranscriptionService, perform: { success: true, transcriptions: 'First audio text. ' })
)
allow(Messages::AudioTranscriptionService).to receive(:new).with(audio2).and_return(
instance_double(Messages::AudioTranscriptionService, perform: { success: true, transcriptions: 'Second audio text.' })
)
end
it 'concatenates all successful transcriptions' do
audio1 # trigger creation
audio2 # trigger creation
attachments = message.attachments
result = service.send(:extract_audio_transcriptions, attachments)
expect(result).to eq('First audio text. Second audio text.')
end
end
context 'with failed audio transcriptions' do
let(:audio_attachment) do
attachment = message.attachments.build(account_id: message.account_id, file_type: :audio)
attachment.save!
attachment
end
before do
allow(Messages::AudioTranscriptionService).to receive(:new).with(audio_attachment).and_return(
instance_double(Messages::AudioTranscriptionService, perform: { success: false, transcriptions: nil })
)
end
it 'returns empty string for failed transcriptions' do
audio_attachment # trigger creation
attachments = message.attachments
result = service.send(:extract_audio_transcriptions, attachments)
expect(result).to eq('')
end
end
end
describe 'private helper methods' do
describe '#text_part' do
it 'returns correct text part format' do
result = service.send(:text_part, 'Hello world')
expect(result).to eq({ type: 'text', text: 'Hello world' })
end
end
describe '#image_part' do
it 'returns correct image part format' do
result = service.send(:image_part, 'https://example.com/image.jpg')
expect(result).to eq({ type: 'image_url', image_url: { url: 'https://example.com/image.jpg' } })
end
end
end
end