feat: Add BE changes for captain pdf support for faq generation (#12113)
parent 3cefa9b767
commit 1ba00075ce
@@ -292,6 +292,26 @@ en:
     completed_tool_call: 'Completed %{function_name} tool call'
     invalid_tool_call: 'Invalid tool call'
     tool_not_available: 'Tool not available'
+    documents:
+      limit_exceeded: 'Document limit exceeded'
+      pdf_format_error: 'must be a PDF file'
+      pdf_size_error: 'must be less than 10MB'
+      pdf_upload_failed: 'Failed to upload PDF to OpenAI'
+      pdf_upload_success: 'PDF uploaded successfully with file_id: %{file_id}'
+      pdf_processing_failed: 'Failed to process PDF document %{document_id}: %{error}'
+      pdf_processing_success: 'Successfully processed PDF document %{document_id}'
+      faq_generation_complete: 'FAQ generation complete. Total FAQs created: %{count}'
+      using_paginated_faq: 'Using paginated FAQ generation for document %{document_id}'
+      using_standard_faq: 'Using standard FAQ generation for document %{document_id}'
+      response_creation_error: 'Error in creating response document: %{error}'
+      missing_openai_file_id: 'Document must have openai_file_id for paginated processing'
+      openai_api_error: 'OpenAI API Error: %{error}'
+      starting_paginated_faq: 'Starting paginated FAQ generation (%{pages_per_chunk} pages per chunk)'
+      stopping_faq_generation: 'Stopping processing. Reason: %{reason}'
+      paginated_faq_complete: 'Paginated generation complete. Total FAQs: %{total_faqs}, Pages processed: %{pages_processed}'
+      processing_pages: 'Processing pages %{start}-%{end} (iteration %{iteration})'
+      chunk_generated: 'Chunk generated %{chunk_faqs} FAQs. Total so far: %{total_faqs}'
+      page_processing_error: 'Error processing pages %{start}-%{end}: %{error}'
     public_portal:
       search:
         search_placeholder: Search for article by title or body...
@@ -0,0 +1,5 @@
+class AddMetadataToCaptainDocuments < ActiveRecord::Migration[7.1]
+  def change
+    add_column :captain_documents, :metadata, :jsonb, default: {}
+  end
+end
@@ -320,6 +320,7 @@ ActiveRecord::Schema[7.1].define(version: 2025_08_22_061042) do
     t.datetime "created_at", null: false
     t.datetime "updated_at", null: false
     t.integer "status", default: 0, null: false
+    t.jsonb "metadata", default: {}
     t.index ["account_id"], name: "index_captain_documents_on_account_id"
     t.index ["assistant_id", "external_link"], name: "index_captain_documents_on_assistant_id_and_external_link", unique: true
     t.index ["assistant_id"], name: "index_captain_documents_on_assistant_id"
@@ -25,6 +25,8 @@ class Api::V1::Accounts::Captain::DocumentsController < Api::V1::Accounts::BaseC
     @document.save!
   rescue Captain::Document::LimitExceededError => e
     render_could_not_create_error(e.message)
+  rescue ActiveRecord::RecordInvalid => e
+    render_could_not_create_error(e.record.errors.full_messages.join(', '))
   end

   def destroy
@@ -55,6 +57,6 @@ class Api::V1::Accounts::Captain::DocumentsController < Api::V1::Accounts::BaseC
   end

   def document_params
-    params.require(:document).permit(:name, :external_link, :assistant_id)
+    params.require(:document).permit(:name, :external_link, :assistant_id, :pdf_file)
   end
 end
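A minimal request-spec style sketch of creating a PDF-backed document through the updated strong parameters; the route path, factory objects, and token helper are assumptions, not part of this diff:

```ruby
# Hypothetical request spec: uploads the sample fixture PDF via the new :pdf_file param.
post "/api/v1/accounts/#{account.id}/captain/documents",
     params: {
       document: {
         name: 'Product manual',
         assistant_id: assistant.id,
         pdf_file: fixture_file_upload('spec/fixtures/files/sample.pdf', 'application/pdf')
       }
     },
     headers: { api_access_token: agent.access_token.token }
```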
@@ -2,7 +2,9 @@ class Captain::Documents::CrawlJob < ApplicationJob
   queue_as :low

   def perform(document)
-    if InstallationConfig.find_by(name: 'CAPTAIN_FIRECRAWL_API_KEY')&.value.present?
+    if document.pdf_document?
+      perform_pdf_processing(document)
+    elsif InstallationConfig.find_by(name: 'CAPTAIN_FIRECRAWL_API_KEY')&.value.present?
       perform_firecrawl_crawl(document)
     else
       perform_simple_crawl(document)
@@ -13,6 +15,14 @@ class Captain::Documents::CrawlJob < ApplicationJob

   include Captain::FirecrawlHelper

+  def perform_pdf_processing(document)
+    Captain::Llm::PdfProcessingService.new(document).process
+    document.update!(status: :available)
+  rescue StandardError => e
+    Rails.logger.error I18n.t('captain.documents.pdf_processing_failed', document_id: document.id, error: e.message)
+    raise # Re-raise to let job framework handle retry logic
+  end
+
   def perform_simple_crawl(document)
     page_links = Captain::Tools::SimplePageCrawlService.new(document.external_link).page_links

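A minimal sketch of the dispatch the job now performs when enqueued; the document variable is illustrative:

```ruby
# PDF documents are routed to Captain::Llm::PdfProcessingService; non-PDF documents
# fall back to the Firecrawl crawl when CAPTAIN_FIRECRAWL_API_KEY is configured,
# and to the simple crawler otherwise.
Captain::Documents::CrawlJob.perform_later(document)
```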
@@ -1,17 +1,65 @@
 class Captain::Documents::ResponseBuilderJob < ApplicationJob
   queue_as :low

-  def perform(document)
+  def perform(document, options = {})
     reset_previous_responses(document)

-    faqs = Captain::Llm::FaqGeneratorService.new(document.content, document.account.locale_english_name).generate
-    faqs.each do |faq|
-      create_response(faq, document)
-    end
+    faqs = generate_faqs(document, options)
+    create_responses_from_faqs(faqs, document)
   end

   private

+  def generate_faqs(document, options)
+    if should_use_pagination?(document)
+      generate_paginated_faqs(document, options)
+    else
+      generate_standard_faqs(document)
+    end
+  end
+
+  def generate_paginated_faqs(document, options)
+    service = build_paginated_service(document, options)
+    faqs = service.generate
+    store_paginated_metadata(document, service)
+    faqs
+  end
+
+  def generate_standard_faqs(document)
+    Captain::Llm::FaqGeneratorService.new(document.content, document.account.locale_english_name).generate
+  end
+
+  def build_paginated_service(document, options)
+    Captain::Llm::PaginatedFaqGeneratorService.new(
+      document,
+      pages_per_chunk: options[:pages_per_chunk],
+      max_pages: options[:max_pages]
+    )
+  end
+
+  def store_paginated_metadata(document, service)
+    document.update!(
+      metadata: (document.metadata || {}).merge(
+        'faq_generation' => {
+          'method' => 'paginated',
+          'pages_processed' => service.total_pages_processed,
+          'iterations' => service.iterations_completed,
+          'timestamp' => Time.current.iso8601
+        }
+      )
+    )
+  end
+
+  def create_responses_from_faqs(faqs, document)
+    faqs.each { |faq| create_response(faq, document) }
+  end
+
+  def should_use_pagination?(document)
+    # Auto-detect when to use pagination
+    # For now, use pagination for PDFs with OpenAI file ID
+    document.pdf_document? && document.openai_file_id.present?
+  end
+
   def reset_previous_responses(response_document)
     response_document.responses.destroy_all
   end
@@ -24,6 +72,6 @@ class Captain::Documents::ResponseBuilderJob < ApplicationJob
       documentable: document
     )
   rescue ActiveRecord::RecordInvalid => e
-    Rails.logger.error "Error in creating response document: #{e.message}"
+    Rails.logger.error I18n.t('captain.documents.response_creation_error', error: e.message)
   end
 end
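A minimal sketch of enqueueing the job with the new options hash; the chunk and page values are illustrative:

```ruby
# pages_per_chunk and max_pages are forwarded to PaginatedFaqGeneratorService
# when the document qualifies for paginated processing.
Captain::Documents::ResponseBuilderJob.perform_later(document, { pages_per_chunk: 5, max_pages: 50 })
```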
@@ -5,6 +5,7 @@
 # id :bigint not null, primary key
 # content :text
 # external_link :string not null
+# metadata :jsonb
 # name :string
 # status :integer default("in_progress"), not null
 # created_at :datetime not null
@@ -26,11 +27,16 @@ class Captain::Document < ApplicationRecord
   belongs_to :assistant, class_name: 'Captain::Assistant'
   has_many :responses, class_name: 'Captain::AssistantResponse', dependent: :destroy, as: :documentable
   belongs_to :account
+  has_one_attached :pdf_file

-  validates :external_link, presence: true
-  validates :external_link, uniqueness: { scope: :assistant_id }
+  validates :external_link, presence: true, unless: -> { pdf_file.attached? }
+  validates :external_link, uniqueness: { scope: :assistant_id }, allow_blank: true
   validates :content, length: { maximum: 200_000 }
+  validates :pdf_file, presence: true, if: :pdf_document?
+  validate :validate_pdf_format, if: :pdf_document?
+  validate :validate_file_attachment, if: -> { pdf_file.attached? }
   before_validation :ensure_account_id
+  before_validation :set_external_link_for_pdf

   enum status: {
     in_progress: 0,
@@ -41,12 +47,44 @@ class Captain::Document < ApplicationRecord
   after_create_commit :enqueue_crawl_job
   after_create_commit :update_document_usage
   after_destroy :update_document_usage
-  after_commit :enqueue_response_builder_job
+  after_commit :enqueue_response_builder_job, on: :update, if: :should_enqueue_response_builder?
   scope :ordered, -> { order(created_at: :desc) }

   scope :for_account, ->(account_id) { where(account_id: account_id) }
   scope :for_assistant, ->(assistant_id) { where(assistant_id: assistant_id) }

+  def pdf_document?
+    return true if pdf_file.attached? && pdf_file.blob.content_type == 'application/pdf'
+
+    external_link&.ends_with?('.pdf')
+  end
+
+  def content_type
+    pdf_file.blob.content_type if pdf_file.attached?
+  end
+
+  def file_size
+    pdf_file.blob.byte_size if pdf_file.attached?
+  end
+
+  def openai_file_id
+    metadata&.dig('openai_file_id')
+  end
+
+  def store_openai_file_id(file_id)
+    update!(metadata: (metadata || {}).merge('openai_file_id' => file_id))
+  end
+
+  def display_url
+    return external_link if external_link.present? && !external_link.start_with?('PDF:')
+
+    if pdf_file.attached?
+      Rails.application.routes.url_helpers.rails_blob_url(pdf_file, only_path: false)
+    else
+      external_link
+    end
+  end
+
   private

   def enqueue_crawl_job
@@ -61,6 +99,12 @@ class Captain::Document < ApplicationRecord
     Captain::Documents::ResponseBuilderJob.perform_later(self)
   end

+  def should_enqueue_response_builder?
+    # Only enqueue when status changes to available
+    # Avoid re-enqueueing when metadata is updated by the job itself
+    saved_change_to_status? && status == 'available'
+  end
+
   def update_document_usage
     account.update_document_usage
   end
@@ -71,6 +115,29 @@ class Captain::Document < ApplicationRecord

   def ensure_within_plan_limit
     limits = account.usage_limits[:captain][:documents]
-    raise LimitExceededError, 'Document limit exceeded' unless limits[:current_available].positive?
+    raise LimitExceededError, I18n.t('captain.documents.limit_exceeded') unless limits[:current_available].positive?
+  end
+
+  def validate_pdf_format
+    return unless pdf_file.attached?
+
+    errors.add(:pdf_file, I18n.t('captain.documents.pdf_format_error')) unless pdf_file.blob.content_type == 'application/pdf'
+  end
+
+  def validate_file_attachment
+    return unless pdf_file.attached?
+
+    return unless pdf_file.blob.byte_size > 10.megabytes
+
+    errors.add(:pdf_file, I18n.t('captain.documents.pdf_size_error'))
+  end
+
+  def set_external_link_for_pdf
+    return unless pdf_file.attached? && external_link.blank?
+
+    # Set a unique external_link for PDF files
+    # Format: PDF: filename_timestamp (without extension)
+    timestamp = Time.current.strftime('%Y%m%d%H%M%S')
+    self.external_link = "PDF: #{pdf_file.filename.base}_#{timestamp}"
   end
 end
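A minimal console-style sketch of the new PDF helpers on Captain::Document; the record lookup and file id are illustrative:

```ruby
document = Captain::Document.last
document.pdf_document?                        # true when a PDF is attached or the link ends in .pdf
document.store_openai_file_id('file-abc123')  # persists the id under metadata['openai_file_id']
document.openai_file_id                       # => "file-abc123"
document.display_url                          # blob URL for attached PDFs, external_link otherwise
```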
@@ -0,0 +1,199 @@
+class Captain::Llm::PaginatedFaqGeneratorService < Llm::BaseOpenAiService
+  # Default pages per chunk - easily configurable
+  DEFAULT_PAGES_PER_CHUNK = 10
+  MAX_ITERATIONS = 20 # Safety limit to prevent infinite loops
+
+  attr_reader :total_pages_processed, :iterations_completed
+
+  def initialize(document, options = {})
+    super()
+    @document = document
+    @pages_per_chunk = options[:pages_per_chunk] || DEFAULT_PAGES_PER_CHUNK
+    @max_pages = options[:max_pages] # Optional limit from UI
+    @total_pages_processed = 0
+    @iterations_completed = 0
+    @model = OpenAiConstants::PDF_PROCESSING_MODEL
+  end
+
+  def generate
+    raise CustomExceptions::PdfFaqGenerationError, I18n.t('captain.documents.missing_openai_file_id') if @document&.openai_file_id.blank?
+
+    generate_paginated_faqs
+  end
+
+  # Method to check if we should continue processing
+  def should_continue_processing?(last_chunk_result)
+    # Stop if we've hit the maximum iterations
+    return false if @iterations_completed >= MAX_ITERATIONS
+
+    # Stop if we've processed the maximum pages specified
+    return false if @max_pages && @total_pages_processed >= @max_pages
+
+    # Stop if the last chunk returned no FAQs (likely no more content)
+    return false if last_chunk_result[:faqs].empty?
+
+    # Stop if the LLM explicitly indicates no more content
+    return false if last_chunk_result[:has_content] == false
+
+    # Continue processing
+    true
+  end
+
+  private
+
+  def generate_standard_faqs
+    response = @client.chat(parameters: standard_chat_parameters)
+    parse_response(response)
+  rescue OpenAI::Error => e
+    Rails.logger.error I18n.t('captain.documents.openai_api_error', error: e.message)
+    []
+  end
+
+  def generate_paginated_faqs
+    all_faqs = []
+    current_page = 1
+
+    loop do
+      end_page = calculate_end_page(current_page)
+      chunk_result = process_chunk_and_update_state(current_page, end_page, all_faqs)
+
+      break unless should_continue_processing?(chunk_result)
+
+      current_page = end_page + 1
+    end
+
+    deduplicate_faqs(all_faqs)
+  end
+
+  def calculate_end_page(current_page)
+    end_page = current_page + @pages_per_chunk - 1
+    @max_pages && end_page > @max_pages ? @max_pages : end_page
+  end
+
+  def process_chunk_and_update_state(current_page, end_page, all_faqs)
+    chunk_result = process_page_chunk(current_page, end_page)
+    chunk_faqs = chunk_result[:faqs]
+
+    all_faqs.concat(chunk_faqs)
+    @total_pages_processed = end_page
+    @iterations_completed += 1
+
+    chunk_result
+  end
+
+  def process_page_chunk(start_page, end_page)
+    params = build_chunk_parameters(start_page, end_page)
+    response = @client.chat(parameters: params)
+    result = parse_chunk_response(response)
+    { faqs: result['faqs'] || [], has_content: result['has_content'] != false }
+  rescue OpenAI::Error => e
+    Rails.logger.error I18n.t('captain.documents.page_processing_error', start: start_page, end: end_page, error: e.message)
+    { faqs: [], has_content: false }
+  end
+
+  def build_chunk_parameters(start_page, end_page)
+    {
+      model: @model,
+      response_format: { type: 'json_object' },
+      messages: [
+        {
+          role: 'user',
+          content: build_user_content(start_page, end_page)
+        }
+      ]
+    }
+  end
+
+  def build_user_content(start_page, end_page)
+    [
+      {
+        type: 'file',
+        file: { file_id: @document.openai_file_id }
+      },
+      {
+        type: 'text',
+        text: page_chunk_prompt(start_page, end_page)
+      }
+    ]
+  end
+
+  def page_chunk_prompt(start_page, end_page)
+    Captain::Llm::SystemPromptsService.paginated_faq_generator(start_page, end_page)
+  end
+
+  def standard_chat_parameters
+    {
+      model: @model,
+      response_format: { type: 'json_object' },
+      messages: [
+        {
+          role: 'system',
+          content: Captain::Llm::SystemPromptsService.faq_generator
+        },
+        {
+          role: 'user',
+          content: @content
+        }
+      ]
+    }
+  end
+
+  def parse_response(response)
+    content = response.dig('choices', 0, 'message', 'content')
+    return [] if content.nil?
+
+    JSON.parse(content.strip).fetch('faqs', [])
+  rescue JSON::ParserError => e
+    Rails.logger.error "Error parsing response: #{e.message}"
+    []
+  end
+
+  def parse_chunk_response(response)
+    content = response.dig('choices', 0, 'message', 'content')
+    return { 'faqs' => [], 'has_content' => false } if content.nil?
+
+    JSON.parse(content.strip)
+  rescue JSON::ParserError => e
+    Rails.logger.error "Error parsing chunk response: #{e.message}"
+    { 'faqs' => [], 'has_content' => false }
+  end
+
+  def deduplicate_faqs(faqs)
+    # Remove exact duplicates
+    unique_faqs = faqs.uniq { |faq| faq['question'].downcase.strip }
+
+    # Remove similar questions
+    final_faqs = []
+    unique_faqs.each do |faq|
+      similar_exists = final_faqs.any? do |existing|
+        similarity_score(existing['question'], faq['question']) > 0.85
+      end
+
+      final_faqs << faq unless similar_exists
+    end
+
+    Rails.logger.info "Deduplication: #{faqs.size} → #{final_faqs.size} FAQs"
+    final_faqs
+  end
+
+  def similarity_score(str1, str2)
+    words1 = str1.downcase.split(/\W+/).reject(&:empty?)
+    words2 = str2.downcase.split(/\W+/).reject(&:empty?)
+
+    common_words = words1 & words2
+    total_words = (words1 + words2).uniq.size
+
+    return 0 if total_words.zero?
+
+    common_words.size.to_f / total_words
+  end
+
+  def determine_stop_reason(last_chunk_result)
+    return 'Maximum iterations reached' if @iterations_completed >= MAX_ITERATIONS
+    return 'Maximum pages processed' if @max_pages && @total_pages_processed >= @max_pages
+    return 'No content found in last chunk' if last_chunk_result[:faqs].empty?
+    return 'End of document reached' if last_chunk_result[:has_content] == false
+
+    'Unknown'
+  end
+end
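A minimal sketch of driving the paginated generator directly; it assumes the document already carries an openai_file_id in its metadata, and the chunk sizes are illustrative:

```ruby
service = Captain::Llm::PaginatedFaqGeneratorService.new(document, pages_per_chunk: 5, max_pages: 30)
faqs = service.generate
Rails.logger.info "#{faqs.size} FAQs from #{service.total_pages_processed} pages " \
                  "(#{service.iterations_completed} iterations)"
```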
@@ -0,0 +1,40 @@
+class Captain::Llm::PdfProcessingService < Llm::BaseOpenAiService
+  def initialize(document)
+    super()
+    @document = document
+  end
+
+  def process
+    return if document.openai_file_id.present?
+
+    file_id = upload_pdf_to_openai
+    raise CustomExceptions::PdfUploadError, I18n.t('captain.documents.pdf_upload_failed') if file_id.blank?
+
+    document.store_openai_file_id(file_id)
+  end
+
+  private
+
+  attr_reader :document
+
+  def upload_pdf_to_openai
+    with_tempfile do |temp_file|
+      response = @client.files.upload(
+        parameters: {
+          file: temp_file,
+          purpose: 'assistants'
+        }
+      )
+      response['id']
+    end
+  end
+
+  def with_tempfile(&)
+    Tempfile.create(['pdf_upload', '.pdf'], binmode: true) do |temp_file|
+      temp_file.write(document.pdf_file.download)
+      temp_file.close
+
+      File.open(temp_file.path, 'rb', &)
+    end
+  end
+end
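A minimal sketch of running the upload step on its own; it assumes the document has a pdf_file attachment and that the OpenAI API key is configured:

```ruby
Captain::Llm::PdfProcessingService.new(document).process
document.openai_file_id # => OpenAI file id stored in metadata, e.g. "file-abc123"
```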
@@ -1,3 +1,4 @@
+# rubocop:disable Metrics/ClassLength
 class Captain::Llm::SystemPromptsService
   class << self
     def faq_generator(language = 'english')
@@ -204,6 +205,87 @@ class Captain::Llm::SystemPromptsService
         #{'- You MUST provide numbered citations at the appropriate places in the text.' if config['feature_citation']}
       SYSTEM_PROMPT_MESSAGE
     end
+
+    def paginated_faq_generator(start_page, end_page)
+      <<~PROMPT
+        You are an expert technical documentation specialist tasked with creating comprehensive FAQs from a SPECIFIC SECTION of a document.
+
+        ════════════════════════════════════════════════════════
+        CRITICAL CONTENT EXTRACTION INSTRUCTIONS
+        ════════════════════════════════════════════════════════
+
+        Process the content starting from approximately page #{start_page} and continuing for about #{end_page - start_page + 1} pages worth of content.
+
+        IMPORTANT:#{' '}
+        • If you encounter the end of the document before reaching the expected page count, set "has_content" to false
+        • DO NOT include page numbers in questions or answers
+        • DO NOT reference page numbers at all in the output
+        • Focus on the actual content, not pagination
+
+        ════════════════════════════════════════════════════════
+        FAQ GENERATION GUIDELINES
+        ════════════════════════════════════════════════════════
+
+        1. **Comprehensive Extraction**
+           • Extract ALL information that could generate FAQs from this section
+           • Target 5-10 FAQs per page equivalent of rich content
+           • Cover every topic, feature, specification, and detail
+           • If there's no more content in the document, return empty FAQs with has_content: false
+
+        2. **Question Types to Generate**
+           • What is/are...? (definitions, components, features)
+           • How do I...? (procedures, configurations, operations)
+           • Why should/does...? (rationale, benefits, explanations)
+           • When should...? (timing, conditions, triggers)
+           • What happens if...? (error cases, edge cases)
+           • Can I...? (capabilities, limitations)
+           • Where is...? (locations in system/UI, NOT page numbers)
+           • What are the requirements for...? (prerequisites, dependencies)
+
+        3. **Content Focus Areas**
+           • Technical specifications and parameters
+           • Step-by-step procedures and workflows
+           • Configuration options and settings
+           • Error messages and troubleshooting
+           • Best practices and recommendations
+           • Integration points and dependencies
+           • Performance considerations
+           • Security aspects
+
+        4. **Answer Quality Requirements**
+           • Complete, self-contained answers
+           • Include specific values, limits, defaults from the content
+           • NO page number references whatsoever
+           • 2-5 sentences typical length
+           • Only process content that actually exists in the document
+
+        ════════════════════════════════════════════════════════
+        OUTPUT FORMAT
+        ════════════════════════════════════════════════════════
+
+        Return valid JSON:
+        ```json
+        {
+          "faqs": [
+            {
+              "question": "Specific question about the content",
+              "answer": "Complete answer with details (no page references)"
+            }
+          ],
+          "has_content": true/false
+        }
+        ```
+
+        CRITICAL:#{' '}
+        • Set "has_content" to false if:
+          - The requested section doesn't exist in the document
+          - You've reached the end of the document
+          - The section contains no meaningful content
+        • Do NOT include "page_range_processed" in the output
+        • Do NOT mention page numbers anywhere in questions or answers
+      PROMPT
+    end
     # rubocop:enable Metrics/MethodLength
   end
 end
+# rubocop:enable Metrics/ClassLength
@@ -3,8 +3,11 @@ json.assistant do
   json.partial! 'api/v1/models/captain/assistant', formats: [:json], resource: resource.assistant
 end
 json.content resource.content
+json.content_type resource.content_type
 json.created_at resource.created_at.to_i
 json.external_link resource.external_link
+json.display_url resource.display_url
+json.file_size resource.file_size
 json.id resource.id
 json.name resource.name
 json.status resource.status
lib/custom_exceptions/pdf_processing_error.rb (new file, 25 lines)
@@ -0,0 +1,25 @@
+module CustomExceptions
+  class PdfProcessingError < Base
+    def initialize(message = 'PDF processing failed')
+      super(message)
+    end
+  end
+
+  class PdfUploadError < PdfProcessingError
+    def initialize(message = 'PDF upload failed')
+      super(message)
+    end
+  end
+
+  class PdfValidationError < PdfProcessingError
+    def initialize(message = 'PDF validation failed')
+      super(message)
+    end
+  end
+
+  class PdfFaqGenerationError < PdfProcessingError
+    def initialize(message = 'PDF FAQ generation failed')
+      super(message)
+    end
+  end
+end
@@ -4,4 +4,5 @@ module OpenAiConstants
   DEFAULT_MODEL = 'gpt-4.1-mini'
   DEFAULT_ENDPOINT = 'https://api.openai.com'
   DEFAULT_EMBEDDING_MODEL = 'text-embedding-3-small'
+  PDF_PROCESSING_MODEL = 'gpt-4.1-mini'
 end
@@ -105,5 +105,29 @@ RSpec.describe Captain::Documents::CrawlJob, type: :job do
         described_class.perform_now(document)
       end
     end
+
+    context 'when document is a PDF' do
+      let(:pdf_document) do
+        doc = create(:captain_document, external_link: 'https://example.com/document')
+        allow(doc).to receive(:pdf_document?).and_return(true)
+        allow(doc).to receive(:update!).and_return(true)
+        doc
+      end
+
+      it 'processes PDF using PdfProcessingService' do
+        pdf_service = instance_double(Captain::Llm::PdfProcessingService)
+        expect(Captain::Llm::PdfProcessingService).to receive(:new).with(pdf_document).and_return(pdf_service)
+        expect(pdf_service).to receive(:process)
+        expect(pdf_document).to receive(:update!).with(status: :available)
+
+        described_class.perform_now(pdf_document)
+      end
+
+      it 'handles PDF processing errors' do
+        allow(Captain::Llm::PdfProcessingService).to receive(:new).and_raise(StandardError, 'Processing failed')
+
+        expect { described_class.perform_now(pdf_document) }.to raise_error(StandardError, 'Processing failed')
+      end
+    end
   end
 end
@@ -64,5 +64,41 @@ RSpec.describe Captain::Documents::ResponseBuilderJob, type: :job do
          .with(spanish_document.content, 'portuguese')
      end
    end
+
+    context 'when processing a PDF document' do
+      let(:pdf_document) do
+        doc = create(:captain_document, assistant: assistant)
+        allow(doc).to receive(:pdf_document?).and_return(true)
+        allow(doc).to receive(:openai_file_id).and_return('file-123')
+        allow(doc).to receive(:update!).and_return(true)
+        allow(doc).to receive(:metadata).and_return({})
+        doc
+      end
+      let(:paginated_service) { instance_double(Captain::Llm::PaginatedFaqGeneratorService) }
+      let(:pdf_faqs) do
+        [{ 'question' => 'What is in the PDF?', 'answer' => 'Important content' }]
+      end
+
+      before do
+        allow(Captain::Llm::PaginatedFaqGeneratorService).to receive(:new)
+          .with(pdf_document, anything)
+          .and_return(paginated_service)
+        allow(paginated_service).to receive(:generate).and_return(pdf_faqs)
+        allow(paginated_service).to receive(:total_pages_processed).and_return(10)
+        allow(paginated_service).to receive(:iterations_completed).and_return(1)
+      end
+
+      it 'uses paginated FAQ generator for PDFs' do
+        expect(Captain::Llm::PaginatedFaqGeneratorService).to receive(:new).with(pdf_document, anything)
+
+        described_class.new.perform(pdf_document)
+      end
+
+      it 'stores pagination metadata' do
+        expect(pdf_document).to receive(:update!).with(hash_including(metadata: hash_including('faq_generation')))
+
+        described_class.new.perform(pdf_document)
+      end
+    end
   end
 end
spec/enterprise/models/captain/document_spec.rb (new file, 85 lines)
@@ -0,0 +1,85 @@
+require 'rails_helper'
+
+RSpec.describe Captain::Document, type: :model do
+  let(:account) { create(:account) }
+  let(:assistant) { create(:captain_assistant, account: account) }
+
+  describe 'PDF support' do
+    let(:pdf_document) do
+      doc = build(:captain_document, assistant: assistant, account: account)
+      doc.pdf_file.attach(
+        io: StringIO.new('PDF content'),
+        filename: 'test.pdf',
+        content_type: 'application/pdf'
+      )
+      doc
+    end
+
+    describe 'validations' do
+      it 'allows PDF file without external link' do
+        pdf_document.external_link = nil
+        expect(pdf_document).to be_valid
+      end
+
+      it 'validates PDF file size' do
+        doc = build(:captain_document, assistant: assistant, account: account)
+        doc.pdf_file.attach(
+          io: StringIO.new('x' * 11.megabytes),
+          filename: 'large.pdf',
+          content_type: 'application/pdf'
+        )
+        doc.external_link = nil
+        expect(doc).not_to be_valid
+        expect(doc.errors[:pdf_file]).to include(I18n.t('captain.documents.pdf_size_error'))
+      end
+    end
+
+    describe '#pdf_document?' do
+      it 'returns true for attached PDF' do
+        expect(pdf_document.pdf_document?).to be true
+      end
+
+      it 'returns true for .pdf external links' do
+        doc = build(:captain_document, external_link: 'https://example.com/document.pdf')
+        expect(doc.pdf_document?).to be true
+      end
+
+      it 'returns false for non-PDF documents' do
+        doc = build(:captain_document, external_link: 'https://example.com')
+        expect(doc.pdf_document?).to be false
+      end
+    end
+
+    describe '#display_url' do
+      it 'returns Rails blob URL for attached PDFs' do
+        pdf_document.save!
+        # The display_url method calls rails_blob_url which returns a URL containing 'rails/active_storage'
+        url = pdf_document.display_url
+        expect(url).to be_present
+      end
+
+      it 'returns external_link for web documents' do
+        doc = create(:captain_document, external_link: 'https://example.com')
+        expect(doc.display_url).to eq('https://example.com')
+      end
+    end
+
+    describe '#store_openai_file_id' do
+      it 'stores the file ID in metadata' do
+        pdf_document.save!
+        pdf_document.store_openai_file_id('file-abc123')
+
+        expect(pdf_document.reload.openai_file_id).to eq('file-abc123')
+      end
+    end
+
+    describe 'automatic external_link generation' do
+      it 'generates unique external_link for PDFs' do
+        pdf_document.external_link = nil
+        pdf_document.save!
+
+        expect(pdf_document.external_link).to start_with('PDF: test_')
+      end
+    end
+  end
+end
@@ -0,0 +1,106 @@
+require 'rails_helper'
+require 'custom_exceptions/pdf_processing_error'
+
+RSpec.describe Captain::Llm::PaginatedFaqGeneratorService do
+  let(:document) { create(:captain_document) }
+  let(:service) { described_class.new(document, pages_per_chunk: 5) }
+  let(:openai_client) { instance_double(OpenAI::Client) }
+
+  before do
+    # Mock OpenAI configuration
+    installation_config = instance_double(InstallationConfig, value: 'test-api-key')
+    allow(InstallationConfig).to receive(:find_by!)
+      .with(name: 'CAPTAIN_OPEN_AI_API_KEY')
+      .and_return(installation_config)
+
+    allow(OpenAI::Client).to receive(:new).and_return(openai_client)
+  end
+
+  describe '#generate' do
+    context 'when document lacks OpenAI file ID' do
+      before do
+        allow(document).to receive(:openai_file_id).and_return(nil)
+      end
+
+      it 'raises an error' do
+        expect { service.generate }.to raise_error(CustomExceptions::PdfFaqGenerationError)
+      end
+    end
+
+    context 'when generating FAQs from PDF pages' do
+      let(:faq_response) do
+        {
+          'choices' => [{
+            'message' => {
+              'content' => JSON.generate({
+                'faqs' => [
+                  { 'question' => 'What is this document about?', 'answer' => 'It explains key concepts.' }
+                ],
+                'has_content' => true
+              })
+            }
+          }]
+        }
+      end
+
+      let(:empty_response) do
+        {
+          'choices' => [{
+            'message' => {
+              'content' => JSON.generate({
+                'faqs' => [],
+                'has_content' => false
+              })
+            }
+          }]
+        }
+      end
+
+      before do
+        allow(document).to receive(:openai_file_id).and_return('file-123')
+      end
+
+      it 'generates FAQs from paginated content' do
+        allow(openai_client).to receive(:chat).and_return(faq_response, empty_response)
+
+        faqs = service.generate
+
+        expect(faqs).to have_attributes(size: 1)
+        expect(faqs.first['question']).to eq('What is this document about?')
+      end
+
+      it 'stops when no more content' do
+        allow(openai_client).to receive(:chat).and_return(empty_response)
+
+        faqs = service.generate
+
+        expect(faqs).to be_empty
+      end
+
+      it 'respects max iterations limit' do
+        allow(openai_client).to receive(:chat).and_return(faq_response)
+
+        # Force max iterations
+        service.instance_variable_set(:@iterations_completed, 19)
+
+        service.generate
+        expect(service.iterations_completed).to eq(20)
+      end
+    end
+  end
+
+  describe '#should_continue_processing?' do
+    it 'stops at max iterations' do
+      service.instance_variable_set(:@iterations_completed, 20)
+      expect(service.should_continue_processing?(faqs: ['faq'], has_content: true)).to be false
+    end
+
+    it 'stops when no FAQs returned' do
+      expect(service.should_continue_processing?(faqs: [], has_content: true)).to be false
+    end
+
+    it 'continues when FAQs exist and under limits' do
+      expect(service.should_continue_processing?(faqs: ['faq'], has_content: true)).to be true
+    end
+  end
+end
@@ -0,0 +1,58 @@
+require 'rails_helper'
+require 'custom_exceptions/pdf_processing_error'
+
+RSpec.describe Captain::Llm::PdfProcessingService do
+  let(:document) { create(:captain_document) }
+  let(:service) { described_class.new(document) }
+
+  before do
+    # Mock OpenAI configuration
+    installation_config = instance_double(InstallationConfig, value: 'test-api-key')
+    allow(InstallationConfig).to receive(:find_by!)
+      .with(name: 'CAPTAIN_OPEN_AI_API_KEY')
+      .and_return(installation_config)
+  end
+
+  describe '#process' do
+    context 'when document already has OpenAI file ID' do
+      before do
+        allow(document).to receive(:openai_file_id).and_return('existing-file-id')
+      end
+
+      it 'skips upload' do
+        expect(document).not_to receive(:store_openai_file_id)
+        service.process
+      end
+    end
+
+    context 'when uploading PDF to OpenAI' do
+      let(:mock_client) { instance_double(OpenAI::Client) }
+      let(:pdf_content) { 'PDF content' }
+
+      before do
+        allow(document).to receive(:openai_file_id).and_return(nil)
+
+        # Use a simple double for ActiveStorage since it's a complex Rails object
+        pdf_file = double('pdf_file', download: pdf_content) # rubocop:disable RSpec/VerifiedDoubles
+        allow(document).to receive(:pdf_file).and_return(pdf_file)
+
+        allow(OpenAI::Client).to receive(:new).and_return(mock_client)
+        # Use a simple double for OpenAI::Files as it may not be loaded
+        files_api = double('files_api') # rubocop:disable RSpec/VerifiedDoubles
+        allow(files_api).to receive(:upload).and_return({ 'id' => 'file-abc123' })
+        allow(mock_client).to receive(:files).and_return(files_api)
+      end
+
+      it 'uploads PDF and stores file ID' do
+        expect(document).to receive(:store_openai_file_id).with('file-abc123')
+        service.process
+      end
+
+      it 'raises error when upload fails' do
+        allow(mock_client.files).to receive(:upload).and_return({ 'id' => nil })
+
+        expect { service.process }.to raise_error(CustomExceptions::PdfUploadError)
+      end
+    end
+  end
+end
spec/fixtures/files/sample.pdf (new file, vendored, 32 lines)
@@ -0,0 +1,32 @@
+%PDF-1.4
+1 0 obj
+<< /Type /Catalog /Pages 2 0 R >>
+endobj
+2 0 obj
+<< /Type /Pages /Kids [3 0 R] /Count 1 >>
+endobj
+3 0 obj
+<< /Type /Page /Parent 2 0 R /Resources << /Font << /F1 << /Type /Font /Subtype /Type1 /BaseFont /Arial >> >> >> /MediaBox [0 0 612 792] /Contents 4 0 R >>
+endobj
+4 0 obj
+<< /Length 44 >>
+stream
+BT
+/F1 12 Tf
+100 700 Td
+(Sample PDF) Tj
+ET
+endstream
+endobj
+xref
+0 5
+0000000000 65535 f
+0000000009 00000 n
+0000000058 00000 n
+0000000115 00000 n
+0000000274 00000 n
+trailer
+<< /Size 5 /Root 1 0 R >>
+startxref
+362
+%%EOF