diff --git a/config/locales/en.yml b/config/locales/en.yml index f424716af..cb72bb699 100644 --- a/config/locales/en.yml +++ b/config/locales/en.yml @@ -292,6 +292,26 @@ en: completed_tool_call: 'Completed %{function_name} tool call' invalid_tool_call: 'Invalid tool call' tool_not_available: 'Tool not available' + documents: + limit_exceeded: 'Document limit exceeded' + pdf_format_error: 'must be a PDF file' + pdf_size_error: 'must be less than 10MB' + pdf_upload_failed: 'Failed to upload PDF to OpenAI' + pdf_upload_success: 'PDF uploaded successfully with file_id: %{file_id}' + pdf_processing_failed: 'Failed to process PDF document %{document_id}: %{error}' + pdf_processing_success: 'Successfully processed PDF document %{document_id}' + faq_generation_complete: 'FAQ generation complete. Total FAQs created: %{count}' + using_paginated_faq: 'Using paginated FAQ generation for document %{document_id}' + using_standard_faq: 'Using standard FAQ generation for document %{document_id}' + response_creation_error: 'Error in creating response document: %{error}' + missing_openai_file_id: 'Document must have openai_file_id for paginated processing' + openai_api_error: 'OpenAI API Error: %{error}' + starting_paginated_faq: 'Starting paginated FAQ generation (%{pages_per_chunk} pages per chunk)' + stopping_faq_generation: 'Stopping processing. Reason: %{reason}' + paginated_faq_complete: 'Paginated generation complete. Total FAQs: %{total_faqs}, Pages processed: %{pages_processed}' + processing_pages: 'Processing pages %{start}-%{end} (iteration %{iteration})' + chunk_generated: 'Chunk generated %{chunk_faqs} FAQs. Total so far: %{total_faqs}' + page_processing_error: 'Error processing pages %{start}-%{end}: %{error}' public_portal: search: search_placeholder: Search for article by title or body... 
diff --git a/db/migrate/20250805082345_add_metadata_to_captain_documents.rb b/db/migrate/20250805082345_add_metadata_to_captain_documents.rb new file mode 100644 index 000000000..452fcf06c --- /dev/null +++ b/db/migrate/20250805082345_add_metadata_to_captain_documents.rb @@ -0,0 +1,5 @@ +class AddMetadataToCaptainDocuments < ActiveRecord::Migration[7.1] + def change + add_column :captain_documents, :metadata, :jsonb, default: {} + end +end diff --git a/db/schema.rb b/db/schema.rb index 798111be7..f48ff6707 100644 --- a/db/schema.rb +++ b/db/schema.rb @@ -320,6 +320,7 @@ ActiveRecord::Schema[7.1].define(version: 2025_08_22_061042) do t.datetime "created_at", null: false t.datetime "updated_at", null: false t.integer "status", default: 0, null: false + t.jsonb "metadata", default: {} t.index ["account_id"], name: "index_captain_documents_on_account_id" t.index ["assistant_id", "external_link"], name: "index_captain_documents_on_assistant_id_and_external_link", unique: true t.index ["assistant_id"], name: "index_captain_documents_on_assistant_id" diff --git a/enterprise/app/controllers/api/v1/accounts/captain/documents_controller.rb b/enterprise/app/controllers/api/v1/accounts/captain/documents_controller.rb index 594aa0642..32fa0a7b6 100644 --- a/enterprise/app/controllers/api/v1/accounts/captain/documents_controller.rb +++ b/enterprise/app/controllers/api/v1/accounts/captain/documents_controller.rb @@ -25,6 +25,8 @@ class Api::V1::Accounts::Captain::DocumentsController < Api::V1::Accounts::BaseC @document.save! 
rescue Captain::Document::LimitExceededError => e render_could_not_create_error(e.message) + rescue ActiveRecord::RecordInvalid => e + render_could_not_create_error(e.record.errors.full_messages.join(', ')) end def destroy @@ -55,6 +57,6 @@ class Api::V1::Accounts::Captain::DocumentsController < Api::V1::Accounts::BaseC end def document_params - params.require(:document).permit(:name, :external_link, :assistant_id) + params.require(:document).permit(:name, :external_link, :assistant_id, :pdf_file) end end diff --git a/enterprise/app/jobs/captain/documents/crawl_job.rb b/enterprise/app/jobs/captain/documents/crawl_job.rb index 132671385..3fa5b6d56 100644 --- a/enterprise/app/jobs/captain/documents/crawl_job.rb +++ b/enterprise/app/jobs/captain/documents/crawl_job.rb @@ -2,7 +2,9 @@ class Captain::Documents::CrawlJob < ApplicationJob queue_as :low def perform(document) - if InstallationConfig.find_by(name: 'CAPTAIN_FIRECRAWL_API_KEY')&.value.present? + if document.pdf_document? + perform_pdf_processing(document) + elsif InstallationConfig.find_by(name: 'CAPTAIN_FIRECRAWL_API_KEY')&.value.present? 
perform_firecrawl_crawl(document) else perform_simple_crawl(document) @@ -13,6 +15,14 @@ class Captain::Documents::CrawlJob < ApplicationJob include Captain::FirecrawlHelper + def perform_pdf_processing(document) + Captain::Llm::PdfProcessingService.new(document).process + document.update!(status: :available) + rescue StandardError => e + Rails.logger.error I18n.t('captain.documents.pdf_processing_failed', document_id: document.id, error: e.message) + raise # Re-raise to let job framework handle retry logic + end + def perform_simple_crawl(document) page_links = Captain::Tools::SimplePageCrawlService.new(document.external_link).page_links diff --git a/enterprise/app/jobs/captain/documents/response_builder_job.rb b/enterprise/app/jobs/captain/documents/response_builder_job.rb index 21025399f..5dacb416f 100644 --- a/enterprise/app/jobs/captain/documents/response_builder_job.rb +++ b/enterprise/app/jobs/captain/documents/response_builder_job.rb @@ -1,17 +1,65 @@ class Captain::Documents::ResponseBuilderJob < ApplicationJob queue_as :low - def perform(document) + def perform(document, options = {}) reset_previous_responses(document) - faqs = Captain::Llm::FaqGeneratorService.new(document.content, document.account.locale_english_name).generate - faqs.each do |faq| - create_response(faq, document) - end + faqs = generate_faqs(document, options) + create_responses_from_faqs(faqs, document) end private + def generate_faqs(document, options) + if should_use_pagination?(document) + generate_paginated_faqs(document, options) + else + generate_standard_faqs(document) + end + end + + def generate_paginated_faqs(document, options) + service = build_paginated_service(document, options) + faqs = service.generate + store_paginated_metadata(document, service) + faqs + end + + def generate_standard_faqs(document) + Captain::Llm::FaqGeneratorService.new(document.content, document.account.locale_english_name).generate + end + + def build_paginated_service(document, options) + 
Captain::Llm::PaginatedFaqGeneratorService.new( + document, + pages_per_chunk: options[:pages_per_chunk], + max_pages: options[:max_pages] + ) + end + + def store_paginated_metadata(document, service) + document.update!( + metadata: (document.metadata || {}).merge( + 'faq_generation' => { + 'method' => 'paginated', + 'pages_processed' => service.total_pages_processed, + 'iterations' => service.iterations_completed, + 'timestamp' => Time.current.iso8601 + } + ) + ) + end + + def create_responses_from_faqs(faqs, document) + faqs.each { |faq| create_response(faq, document) } + end + + def should_use_pagination?(document) + # Auto-detect when to use pagination + # For now, use pagination for PDFs with OpenAI file ID + document.pdf_document? && document.openai_file_id.present? + end + def reset_previous_responses(response_document) response_document.responses.destroy_all end @@ -24,6 +72,6 @@ class Captain::Documents::ResponseBuilderJob < ApplicationJob documentable: document ) rescue ActiveRecord::RecordInvalid => e - Rails.logger.error "Error in creating response document: #{e.message}" + Rails.logger.error I18n.t('captain.documents.response_creation_error', error: e.message) end end diff --git a/enterprise/app/models/captain/document.rb b/enterprise/app/models/captain/document.rb index d2a02f5b5..c278eb879 100644 --- a/enterprise/app/models/captain/document.rb +++ b/enterprise/app/models/captain/document.rb @@ -5,6 +5,7 @@ # id :bigint not null, primary key # content :text # external_link :string not null +# metadata :jsonb # name :string # status :integer default("in_progress"), not null # created_at :datetime not null @@ -26,11 +27,16 @@ class Captain::Document < ApplicationRecord belongs_to :assistant, class_name: 'Captain::Assistant' has_many :responses, class_name: 'Captain::AssistantResponse', dependent: :destroy, as: :documentable belongs_to :account + has_one_attached :pdf_file - validates :external_link, presence: true - validates :external_link, uniqueness: 
{ scope: :assistant_id } + validates :external_link, presence: true, unless: -> { pdf_file.attached? } + validates :external_link, uniqueness: { scope: :assistant_id }, allow_blank: true validates :content, length: { maximum: 200_000 } + validates :pdf_file, presence: true, if: :pdf_document? + validate :validate_pdf_format, if: :pdf_document? + validate :validate_file_attachment, if: -> { pdf_file.attached? } before_validation :ensure_account_id + before_validation :set_external_link_for_pdf enum status: { in_progress: 0, @@ -41,12 +47,44 @@ class Captain::Document < ApplicationRecord after_create_commit :enqueue_crawl_job after_create_commit :update_document_usage after_destroy :update_document_usage - after_commit :enqueue_response_builder_job + after_commit :enqueue_response_builder_job, on: :update, if: :should_enqueue_response_builder? scope :ordered, -> { order(created_at: :desc) } scope :for_account, ->(account_id) { where(account_id: account_id) } scope :for_assistant, ->(assistant_id) { where(assistant_id: assistant_id) } + def pdf_document? + return true if pdf_file.attached? && pdf_file.blob.content_type == 'application/pdf' + + external_link&.ends_with?('.pdf') + end + + def content_type + pdf_file.blob.content_type if pdf_file.attached? + end + + def file_size + pdf_file.blob.byte_size if pdf_file.attached? + end + + def openai_file_id + metadata&.dig('openai_file_id') + end + + def store_openai_file_id(file_id) + update!(metadata: (metadata || {}).merge('openai_file_id' => file_id)) + end + + def display_url + return external_link if external_link.present? && !external_link.start_with?('PDF:') + + if pdf_file.attached? + Rails.application.routes.url_helpers.rails_blob_url(pdf_file, only_path: false) + else + external_link + end + end + private def enqueue_crawl_job @@ -61,6 +99,12 @@ class Captain::Document < ApplicationRecord Captain::Documents::ResponseBuilderJob.perform_later(self) end + def should_enqueue_response_builder? 
+ # Only enqueue when status changes to available + # Avoid re-enqueueing when metadata is updated by the job itself + saved_change_to_status? && status == 'available' + end + def update_document_usage account.update_document_usage end @@ -71,6 +115,29 @@ class Captain::Document < ApplicationRecord def ensure_within_plan_limit limits = account.usage_limits[:captain][:documents] - raise LimitExceededError, 'Document limit exceeded' unless limits[:current_available].positive? + raise LimitExceededError, I18n.t('captain.documents.limit_exceeded') unless limits[:current_available].positive? + end + + def validate_pdf_format + return unless pdf_file.attached? + + errors.add(:pdf_file, I18n.t('captain.documents.pdf_format_error')) unless pdf_file.blob.content_type == 'application/pdf' + end + + def validate_file_attachment + return unless pdf_file.attached? + + return unless pdf_file.blob.byte_size > 10.megabytes + + errors.add(:pdf_file, I18n.t('captain.documents.pdf_size_error')) + end + + def set_external_link_for_pdf + return unless pdf_file.attached? && external_link.blank? 
+ + # Set a unique external_link for PDF files + # Format: PDF: filename_timestamp (without extension) + timestamp = Time.current.strftime('%Y%m%d%H%M%S') + self.external_link = "PDF: #{pdf_file.filename.base}_#{timestamp}" end end diff --git a/enterprise/app/services/captain/llm/paginated_faq_generator_service.rb b/enterprise/app/services/captain/llm/paginated_faq_generator_service.rb new file mode 100644 index 000000000..18f9813ef --- /dev/null +++ b/enterprise/app/services/captain/llm/paginated_faq_generator_service.rb @@ -0,0 +1,199 @@ +class Captain::Llm::PaginatedFaqGeneratorService < Llm::BaseOpenAiService + # Default pages per chunk - easily configurable + DEFAULT_PAGES_PER_CHUNK = 10 + MAX_ITERATIONS = 20 # Safety limit to prevent infinite loops + + attr_reader :total_pages_processed, :iterations_completed + + def initialize(document, options = {}) + super() + @document = document + @pages_per_chunk = options[:pages_per_chunk] || DEFAULT_PAGES_PER_CHUNK + @max_pages = options[:max_pages] # Optional limit from UI + @total_pages_processed = 0 + @iterations_completed = 0 + @model = OpenAiConstants::PDF_PROCESSING_MODEL + end + + def generate + raise CustomExceptions::PdfFaqGenerationError, I18n.t('captain.documents.missing_openai_file_id') if @document&.openai_file_id.blank? + + generate_paginated_faqs + end + + # Method to check if we should continue processing + def should_continue_processing?(last_chunk_result) + # Stop if we've hit the maximum iterations + return false if @iterations_completed >= MAX_ITERATIONS + + # Stop if we've processed the maximum pages specified + return false if @max_pages && @total_pages_processed >= @max_pages + + # Stop if the last chunk returned no FAQs (likely no more content) + return false if last_chunk_result[:faqs].empty? 
+ + # Stop if the LLM explicitly indicates no more content + return false if last_chunk_result[:has_content] == false + + # Continue processing + true + end + + private + + def generate_standard_faqs + response = @client.chat(parameters: standard_chat_parameters) + parse_response(response) + rescue OpenAI::Error => e + Rails.logger.error I18n.t('captain.documents.openai_api_error', error: e.message) + [] + end + + def generate_paginated_faqs + all_faqs = [] + current_page = 1 + + loop do + end_page = calculate_end_page(current_page) + chunk_result = process_chunk_and_update_state(current_page, end_page, all_faqs) + + break unless should_continue_processing?(chunk_result) + + current_page = end_page + 1 + end + + deduplicate_faqs(all_faqs) + end + + def calculate_end_page(current_page) + end_page = current_page + @pages_per_chunk - 1 + @max_pages && end_page > @max_pages ? @max_pages : end_page + end + + def process_chunk_and_update_state(current_page, end_page, all_faqs) + chunk_result = process_page_chunk(current_page, end_page) + chunk_faqs = chunk_result[:faqs] + + all_faqs.concat(chunk_faqs) + @total_pages_processed = end_page + @iterations_completed += 1 + + chunk_result + end + + def process_page_chunk(start_page, end_page) + params = build_chunk_parameters(start_page, end_page) + response = @client.chat(parameters: params) + result = parse_chunk_response(response) + { faqs: result['faqs'] || [], has_content: result['has_content'] != false } + rescue OpenAI::Error => e + Rails.logger.error I18n.t('captain.documents.page_processing_error', start: start_page, end: end_page, error: e.message) + { faqs: [], has_content: false } + end + + def build_chunk_parameters(start_page, end_page) + { + model: @model, + response_format: { type: 'json_object' }, + messages: [ + { + role: 'user', + content: build_user_content(start_page, end_page) + } + ] + } + end + + def build_user_content(start_page, end_page) + [ + { + type: 'file', + file: { file_id: 
@document.openai_file_id } + }, + { + type: 'text', + text: page_chunk_prompt(start_page, end_page) + } + ] + end + + def page_chunk_prompt(start_page, end_page) + Captain::Llm::SystemPromptsService.paginated_faq_generator(start_page, end_page) + end + + def standard_chat_parameters + { + model: @model, + response_format: { type: 'json_object' }, + messages: [ + { + role: 'system', + content: Captain::Llm::SystemPromptsService.faq_generator + }, + { + role: 'user', + content: @content + } + ] + } + end + + def parse_response(response) + content = response.dig('choices', 0, 'message', 'content') + return [] if content.nil? + + JSON.parse(content.strip).fetch('faqs', []) + rescue JSON::ParserError => e + Rails.logger.error "Error parsing response: #{e.message}" + [] + end + + def parse_chunk_response(response) + content = response.dig('choices', 0, 'message', 'content') + return { 'faqs' => [], 'has_content' => false } if content.nil? + + JSON.parse(content.strip) + rescue JSON::ParserError => e + Rails.logger.error "Error parsing chunk response: #{e.message}" + { 'faqs' => [], 'has_content' => false } + end + + def deduplicate_faqs(faqs) + # Remove exact duplicates + unique_faqs = faqs.uniq { |faq| faq['question'].downcase.strip } + + # Remove similar questions + final_faqs = [] + unique_faqs.each do |faq| + similar_exists = final_faqs.any? do |existing| + similarity_score(existing['question'], faq['question']) > 0.85 + end + + final_faqs << faq unless similar_exists + end + + Rails.logger.info "Deduplication: #{faqs.size} → #{final_faqs.size} FAQs" + final_faqs + end + + def similarity_score(str1, str2) + words1 = str1.downcase.split(/\W+/).reject(&:empty?) + words2 = str2.downcase.split(/\W+/).reject(&:empty?) + + common_words = words1 & words2 + total_words = (words1 + words2).uniq.size + + return 0 if total_words.zero? 
+ + common_words.size.to_f / total_words + end + + def determine_stop_reason(last_chunk_result) + return 'Maximum iterations reached' if @iterations_completed >= MAX_ITERATIONS + return 'Maximum pages processed' if @max_pages && @total_pages_processed >= @max_pages + return 'No content found in last chunk' if last_chunk_result[:faqs].empty? + return 'End of document reached' if last_chunk_result[:has_content] == false + + 'Unknown' + end +end diff --git a/enterprise/app/services/captain/llm/pdf_processing_service.rb b/enterprise/app/services/captain/llm/pdf_processing_service.rb new file mode 100644 index 000000000..026ef4e48 --- /dev/null +++ b/enterprise/app/services/captain/llm/pdf_processing_service.rb @@ -0,0 +1,40 @@ +class Captain::Llm::PdfProcessingService < Llm::BaseOpenAiService + def initialize(document) + super() + @document = document + end + + def process + return if document.openai_file_id.present? + + file_id = upload_pdf_to_openai + raise CustomExceptions::PdfUploadError, I18n.t('captain.documents.pdf_upload_failed') if file_id.blank? 
+ + document.store_openai_file_id(file_id) + end + + private + + attr_reader :document + + def upload_pdf_to_openai + with_tempfile do |temp_file| + response = @client.files.upload( + parameters: { + file: temp_file, + purpose: 'assistants' + } + ) + response['id'] + end + end + + def with_tempfile(&) + Tempfile.create(['pdf_upload', '.pdf'], binmode: true) do |temp_file| + temp_file.write(document.pdf_file.download) + temp_file.close + + File.open(temp_file.path, 'rb', &) + end + end +end diff --git a/enterprise/app/services/captain/llm/system_prompts_service.rb b/enterprise/app/services/captain/llm/system_prompts_service.rb index a4b149944..b8282beb1 100644 --- a/enterprise/app/services/captain/llm/system_prompts_service.rb +++ b/enterprise/app/services/captain/llm/system_prompts_service.rb @@ -1,3 +1,4 @@ +# rubocop:disable Metrics/ClassLength class Captain::Llm::SystemPromptsService class << self def faq_generator(language = 'english') @@ -204,6 +205,87 @@ class Captain::Llm::SystemPromptsService #{'- You MUST provide numbered citations at the appropriate places in the text.' if config['feature_citation']} SYSTEM_PROMPT_MESSAGE end + + def paginated_faq_generator(start_page, end_page) + <<~PROMPT + You are an expert technical documentation specialist tasked with creating comprehensive FAQs from a SPECIFIC SECTION of a document. + + ════════════════════════════════════════════════════════ + CRITICAL CONTENT EXTRACTION INSTRUCTIONS + ════════════════════════════════════════════════════════ + + Process the content starting from approximately page #{start_page} and continuing for about #{end_page - start_page + 1} pages worth of content. 
+ + IMPORTANT:#{' '} + • If you encounter the end of the document before reaching the expected page count, set "has_content" to false + • DO NOT include page numbers in questions or answers + • DO NOT reference page numbers at all in the output + • Focus on the actual content, not pagination + + ════════════════════════════════════════════════════════ + FAQ GENERATION GUIDELINES + ════════════════════════════════════════════════════════ + + 1. **Comprehensive Extraction** + • Extract ALL information that could generate FAQs from this section + • Target 5-10 FAQs per page equivalent of rich content + • Cover every topic, feature, specification, and detail + • If there's no more content in the document, return empty FAQs with has_content: false + + 2. **Question Types to Generate** + • What is/are...? (definitions, components, features) + • How do I...? (procedures, configurations, operations) + • Why should/does...? (rationale, benefits, explanations) + • When should...? (timing, conditions, triggers) + • What happens if...? (error cases, edge cases) + • Can I...? (capabilities, limitations) + • Where is...? (locations in system/UI, NOT page numbers) + • What are the requirements for...? (prerequisites, dependencies) + + 3. **Content Focus Areas** + • Technical specifications and parameters + • Step-by-step procedures and workflows + • Configuration options and settings + • Error messages and troubleshooting + • Best practices and recommendations + • Integration points and dependencies + • Performance considerations + • Security aspects + + 4. 
**Answer Quality Requirements** + • Complete, self-contained answers + • Include specific values, limits, defaults from the content + • NO page number references whatsoever + • 2-5 sentences typical length + • Only process content that actually exists in the document + + ════════════════════════════════════════════════════════ + OUTPUT FORMAT + ════════════════════════════════════════════════════════ + + Return valid JSON: + ```json + { + "faqs": [ + { + "question": "Specific question about the content", + "answer": "Complete answer with details (no page references)" + } + ], + "has_content": true/false + } + ``` + + CRITICAL:#{' '} + • Set "has_content" to false if: + - The requested section doesn't exist in the document + - You've reached the end of the document + - The section contains no meaningful content + • Do NOT include "page_range_processed" in the output + • Do NOT mention page numbers anywhere in questions or answers + PROMPT + end # rubocop:enable Metrics/MethodLength end end +# rubocop:enable Metrics/ClassLength diff --git a/enterprise/app/views/api/v1/models/captain/_document.json.jbuilder b/enterprise/app/views/api/v1/models/captain/_document.json.jbuilder index 83724b9cd..8064a5181 100644 --- a/enterprise/app/views/api/v1/models/captain/_document.json.jbuilder +++ b/enterprise/app/views/api/v1/models/captain/_document.json.jbuilder @@ -3,8 +3,11 @@ json.assistant do json.partial! 
'api/v1/models/captain/assistant', formats: [:json], resource: resource.assistant end json.content resource.content +json.content_type resource.content_type json.created_at resource.created_at.to_i json.external_link resource.external_link +json.display_url resource.display_url +json.file_size resource.file_size json.id resource.id json.name resource.name json.status resource.status diff --git a/lib/custom_exceptions/pdf_processing_error.rb b/lib/custom_exceptions/pdf_processing_error.rb new file mode 100644 index 000000000..2c81f95d8 --- /dev/null +++ b/lib/custom_exceptions/pdf_processing_error.rb @@ -0,0 +1,25 @@ +module CustomExceptions + class PdfProcessingError < Base + def initialize(message = 'PDF processing failed') + super(message) + end + end + + class PdfUploadError < PdfProcessingError + def initialize(message = 'PDF upload failed') + super(message) + end + end + + class PdfValidationError < PdfProcessingError + def initialize(message = 'PDF validation failed') + super(message) + end + end + + class PdfFaqGenerationError < PdfProcessingError + def initialize(message = 'PDF FAQ generation failed') + super(message) + end + end +end diff --git a/lib/open_ai_constants.rb b/lib/open_ai_constants.rb index 7b7a3ba5f..2094567a7 100644 --- a/lib/open_ai_constants.rb +++ b/lib/open_ai_constants.rb @@ -4,4 +4,5 @@ module OpenAiConstants DEFAULT_MODEL = 'gpt-4.1-mini' DEFAULT_ENDPOINT = 'https://api.openai.com' DEFAULT_EMBEDDING_MODEL = 'text-embedding-3-small' + PDF_PROCESSING_MODEL = 'gpt-4.1-mini' end diff --git a/spec/enterprise/jobs/captain/documents/crawl_job_spec.rb b/spec/enterprise/jobs/captain/documents/crawl_job_spec.rb index 949680d4e..a2dffcf04 100644 --- a/spec/enterprise/jobs/captain/documents/crawl_job_spec.rb +++ b/spec/enterprise/jobs/captain/documents/crawl_job_spec.rb @@ -105,5 +105,29 @@ RSpec.describe Captain::Documents::CrawlJob, type: :job do described_class.perform_now(document) end end + + context 'when document is a PDF' do + 
let(:pdf_document) do + doc = create(:captain_document, external_link: 'https://example.com/document') + allow(doc).to receive(:pdf_document?).and_return(true) + allow(doc).to receive(:update!).and_return(true) + doc + end + + it 'processes PDF using PdfProcessingService' do + pdf_service = instance_double(Captain::Llm::PdfProcessingService) + expect(Captain::Llm::PdfProcessingService).to receive(:new).with(pdf_document).and_return(pdf_service) + expect(pdf_service).to receive(:process) + expect(pdf_document).to receive(:update!).with(status: :available) + + described_class.perform_now(pdf_document) + end + + it 'handles PDF processing errors' do + allow(Captain::Llm::PdfProcessingService).to receive(:new).and_raise(StandardError, 'Processing failed') + + expect { described_class.perform_now(pdf_document) }.to raise_error(StandardError, 'Processing failed') + end + end end end diff --git a/spec/enterprise/jobs/captain/documents/response_builder_job_spec.rb b/spec/enterprise/jobs/captain/documents/response_builder_job_spec.rb index 4dc4bb481..73b67ee27 100644 --- a/spec/enterprise/jobs/captain/documents/response_builder_job_spec.rb +++ b/spec/enterprise/jobs/captain/documents/response_builder_job_spec.rb @@ -64,5 +64,41 @@ RSpec.describe Captain::Documents::ResponseBuilderJob, type: :job do .with(spanish_document.content, 'portuguese') end end + + context 'when processing a PDF document' do + let(:pdf_document) do + doc = create(:captain_document, assistant: assistant) + allow(doc).to receive(:pdf_document?).and_return(true) + allow(doc).to receive(:openai_file_id).and_return('file-123') + allow(doc).to receive(:update!).and_return(true) + allow(doc).to receive(:metadata).and_return({}) + doc + end + let(:paginated_service) { instance_double(Captain::Llm::PaginatedFaqGeneratorService) } + let(:pdf_faqs) do + [{ 'question' => 'What is in the PDF?', 'answer' => 'Important content' }] + end + + before do + allow(Captain::Llm::PaginatedFaqGeneratorService).to 
receive(:new) + .with(pdf_document, anything) + .and_return(paginated_service) + allow(paginated_service).to receive(:generate).and_return(pdf_faqs) + allow(paginated_service).to receive(:total_pages_processed).and_return(10) + allow(paginated_service).to receive(:iterations_completed).and_return(1) + end + + it 'uses paginated FAQ generator for PDFs' do + expect(Captain::Llm::PaginatedFaqGeneratorService).to receive(:new).with(pdf_document, anything) + + described_class.new.perform(pdf_document) + end + + it 'stores pagination metadata' do + expect(pdf_document).to receive(:update!).with(hash_including(metadata: hash_including('faq_generation'))) + + described_class.new.perform(pdf_document) + end + end end end diff --git a/spec/enterprise/models/captain/document_spec.rb b/spec/enterprise/models/captain/document_spec.rb new file mode 100644 index 000000000..56dc1727c --- /dev/null +++ b/spec/enterprise/models/captain/document_spec.rb @@ -0,0 +1,85 @@ +require 'rails_helper' + +RSpec.describe Captain::Document, type: :model do + let(:account) { create(:account) } + let(:assistant) { create(:captain_assistant, account: account) } + + describe 'PDF support' do + let(:pdf_document) do + doc = build(:captain_document, assistant: assistant, account: account) + doc.pdf_file.attach( + io: StringIO.new('PDF content'), + filename: 'test.pdf', + content_type: 'application/pdf' + ) + doc + end + + describe 'validations' do + it 'allows PDF file without external link' do + pdf_document.external_link = nil + expect(pdf_document).to be_valid + end + + it 'validates PDF file size' do + doc = build(:captain_document, assistant: assistant, account: account) + doc.pdf_file.attach( + io: StringIO.new('x' * 11.megabytes), + filename: 'large.pdf', + content_type: 'application/pdf' + ) + doc.external_link = nil + expect(doc).not_to be_valid + expect(doc.errors[:pdf_file]).to include(I18n.t('captain.documents.pdf_size_error')) + end + end + + describe '#pdf_document?' 
do + it 'returns true for attached PDF' do + expect(pdf_document.pdf_document?).to be true + end + + it 'returns true for .pdf external links' do + doc = build(:captain_document, external_link: 'https://example.com/document.pdf') + expect(doc.pdf_document?).to be true + end + + it 'returns false for non-PDF documents' do + doc = build(:captain_document, external_link: 'https://example.com') + expect(doc.pdf_document?).to be false + end + end + + describe '#display_url' do + it 'returns Rails blob URL for attached PDFs' do + pdf_document.save! + # The display_url method calls rails_blob_url which returns a URL containing 'rails/active_storage' + url = pdf_document.display_url + expect(url).to be_present + end + + it 'returns external_link for web documents' do + doc = create(:captain_document, external_link: 'https://example.com') + expect(doc.display_url).to eq('https://example.com') + end + end + + describe '#store_openai_file_id' do + it 'stores the file ID in metadata' do + pdf_document.save! + pdf_document.store_openai_file_id('file-abc123') + + expect(pdf_document.reload.openai_file_id).to eq('file-abc123') + end + end + + describe 'automatic external_link generation' do + it 'generates unique external_link for PDFs' do + pdf_document.external_link = nil + pdf_document.save! 
+
+        expect(pdf_document.external_link).to start_with('PDF: test_')
+      end
+    end
+  end
+end
# Review note (patch-level comment: these '#' lines sit outside any hunk).
# New file: spec for Captain::Llm::PaginatedFaqGeneratorService (106 added lines).
# Setup: stubs InstallationConfig.find_by!(name: 'CAPTAIN_OPEN_AI_API_KEY') and
# OpenAI::Client.new, so every example talks to an instance_double client.
# #generate examples:
#   - openai_file_id stubbed to nil -> expects CustomExceptions::PdfFaqGenerationError.
#   - chat stubbed to return one FAQ payload, then an empty payload -> expects exactly one FAQ.
#   - chat stubbed to return only the empty payload -> expects an empty result.
#   - @iterations_completed preset to 19 with chat always returning FAQs -> expects
#     iterations_completed to equal 20 after generate.
# #should_continue_processing? examples: false with @iterations_completed = 20,
# false with faqs: [], true with one FAQ and the counter left untouched.
# NOTE(review): the limit of 20 iterations is implied only by these expectations;
# the service implementation is not visible in this part of the patch - confirm there.
diff --git a/spec/enterprise/services/captain/llm/paginated_faq_generator_service_spec.rb b/spec/enterprise/services/captain/llm/paginated_faq_generator_service_spec.rb
new file mode 100644
index 000000000..5215a2d40
--- /dev/null
+++ b/spec/enterprise/services/captain/llm/paginated_faq_generator_service_spec.rb
@@ -0,0 +1,106 @@
+require 'rails_helper'
+require 'custom_exceptions/pdf_processing_error'
+
+RSpec.describe Captain::Llm::PaginatedFaqGeneratorService do
+  let(:document) { create(:captain_document) }
+  let(:service) { described_class.new(document, pages_per_chunk: 5) }
+  let(:openai_client) { instance_double(OpenAI::Client) }
+
+  before do
+    # Mock OpenAI configuration
+    installation_config = instance_double(InstallationConfig, value: 'test-api-key')
+    allow(InstallationConfig).to receive(:find_by!)
+      .with(name: 'CAPTAIN_OPEN_AI_API_KEY')
+      .and_return(installation_config)
+
+    allow(OpenAI::Client).to receive(:new).and_return(openai_client)
+  end
+
+  describe '#generate' do
+    context 'when document lacks OpenAI file ID' do
+      before do
+        allow(document).to receive(:openai_file_id).and_return(nil)
+      end
+
+      it 'raises an error' do
+        expect { service.generate }.to raise_error(CustomExceptions::PdfFaqGenerationError)
+      end
+    end
+
+    context 'when generating FAQs from PDF pages' do
+      let(:faq_response) do
+        {
+          'choices' => [{
+            'message' => {
+              'content' => JSON.generate({
+                'faqs' => [
+                  { 'question' => 'What is this document about?', 'answer' => 'It explains key concepts.' }
+                ],
+                'has_content' => true
+              })
+            }
+          }]
+        }
+      end
+
+      let(:empty_response) do
+        {
+          'choices' => [{
+            'message' => {
+              'content' => JSON.generate({
+                'faqs' => [],
+                'has_content' => false
+              })
+            }
+          }]
+        }
+      end
+
+      before do
+        allow(document).to receive(:openai_file_id).and_return('file-123')
+      end
+
+      it 'generates FAQs from paginated content' do
+        allow(openai_client).to receive(:chat).and_return(faq_response, empty_response)
+
+        faqs = service.generate
+
+        expect(faqs).to have_attributes(size: 1)
+        expect(faqs.first['question']).to eq('What is this document about?')
+      end
+
+      it 'stops when no more content' do
+        allow(openai_client).to receive(:chat).and_return(empty_response)
+
+        faqs = service.generate
+
+        expect(faqs).to be_empty
+      end
+
+      it 'respects max iterations limit' do
+        allow(openai_client).to receive(:chat).and_return(faq_response)
+
+        # Force max iterations
+        service.instance_variable_set(:@iterations_completed, 19)
+
+        service.generate
+        expect(service.iterations_completed).to eq(20)
+      end
+    end
+  end
+
+  describe '#should_continue_processing?' do
+    it 'stops at max iterations' do
+      service.instance_variable_set(:@iterations_completed, 20)
+      expect(service.should_continue_processing?(faqs: ['faq'], has_content: true)).to be false
+    end
+
+    it 'stops when no FAQs returned' do
+      expect(service.should_continue_processing?(faqs: [], has_content: true)).to be false
+    end
+
+    it 'continues when FAQs exist and under limits' do
+      expect(service.should_continue_processing?(faqs: ['faq'], has_content: true)).to be true
+    end
+  end
+end
# Review note (patch-level comment: these '#' lines sit outside any hunk).
# New file: spec for Captain::Llm::PdfProcessingService (58 added lines).
# Setup: stubs the InstallationConfig lookup for 'CAPTAIN_OPEN_AI_API_KEY'.
# #process examples:
#   - openai_file_id already present -> store_openai_file_id must not be called.
#   - openai_file_id nil and files.upload stubbed to return { 'id' => 'file-abc123' } ->
#     store_openai_file_id must be called with 'file-abc123'.
#   - files.upload re-stubbed to return { 'id' => nil } -> expects CustomExceptions::PdfUploadError.
# pdf_file and files_api are plain (unverified) doubles, which is why the two
# inline "rubocop:disable RSpec/VerifiedDoubles" markers are present.
diff --git a/spec/enterprise/services/captain/llm/pdf_processing_service_spec.rb b/spec/enterprise/services/captain/llm/pdf_processing_service_spec.rb
new file mode 100644
index 000000000..9dc416685
--- /dev/null
+++ b/spec/enterprise/services/captain/llm/pdf_processing_service_spec.rb
@@ -0,0 +1,58 @@
+require 'rails_helper'
+require 'custom_exceptions/pdf_processing_error'
+
+RSpec.describe Captain::Llm::PdfProcessingService do
+  let(:document) { create(:captain_document) }
+  let(:service) { described_class.new(document) }
+
+  before do
+    # Mock OpenAI configuration
+    installation_config = instance_double(InstallationConfig, value: 'test-api-key')
+    allow(InstallationConfig).to receive(:find_by!)
+      .with(name: 'CAPTAIN_OPEN_AI_API_KEY')
+      .and_return(installation_config)
+  end
+
+  describe '#process' do
+    context 'when document already has OpenAI file ID' do
+      before do
+        allow(document).to receive(:openai_file_id).and_return('existing-file-id')
+      end
+
+      it 'skips upload' do
+        expect(document).not_to receive(:store_openai_file_id)
+        service.process
+      end
+    end
+
+    context 'when uploading PDF to OpenAI' do
+      let(:mock_client) { instance_double(OpenAI::Client) }
+      let(:pdf_content) { 'PDF content' }
+
+      before do
+        allow(document).to receive(:openai_file_id).and_return(nil)
+
+        # Use a simple double for ActiveStorage since it's a complex Rails object
+        pdf_file = double('pdf_file', download: pdf_content) # rubocop:disable RSpec/VerifiedDoubles
+        allow(document).to receive(:pdf_file).and_return(pdf_file)
+
+        allow(OpenAI::Client).to receive(:new).and_return(mock_client)
+        # Use a simple double for OpenAI::Files as it may not be loaded
+        files_api = double('files_api') # rubocop:disable RSpec/VerifiedDoubles
+        allow(files_api).to receive(:upload).and_return({ 'id' => 'file-abc123' })
+        allow(mock_client).to receive(:files).and_return(files_api)
+      end
+
+      it 'uploads PDF and stores file ID' do
+        expect(document).to receive(:store_openai_file_id).with('file-abc123')
+        service.process
+      end
+
+      it 'raises error when upload fails' do
+        allow(mock_client.files).to receive(:upload).and_return({ 'id' => nil })
+
+        expect { service.process }.to raise_error(CustomExceptions::PdfUploadError)
+      end
+    end
+  end
+end
# Review note (patch-level comment: these '#' lines sit outside any hunk).
# New file: spec/fixtures/files/sample.pdf (32 added lines), a plain-text PDF 1.4
# with a single page object (MediaBox 0 0 612 792) whose content stream draws the
# text "Sample PDF" with font resource /F1.
# NOTE(review): the xref entries and the startxref value are byte offsets into the
# file, so the lines of this hunk must stay exactly as committed. The offsets were
# not re-verified here, and any trailing whitespace on the original lines cannot be
# recovered from this view - confirm against the committed blob.
diff --git a/spec/fixtures/files/sample.pdf b/spec/fixtures/files/sample.pdf
new file mode 100644
index 000000000..9f6471c5d
--- /dev/null
+++ b/spec/fixtures/files/sample.pdf
@@ -0,0 +1,32 @@
+%PDF-1.4
+1 0 obj
+<< /Type /Catalog /Pages 2 0 R >>
+endobj
+2 0 obj
+<< /Type /Pages /Kids [3 0 R] /Count 1 >>
+endobj
+3 0 obj
+<< /Type /Page /Parent 2 0 R /Resources << /Font << /F1 << /Type /Font /Subtype /Type1 /BaseFont /Arial >> >> >> /MediaBox [0 0 612 792] /Contents 4 0 R >>
+endobj
+4 0 obj
+<< /Length 44 >>
+stream
+BT
+/F1 12 Tf
+100 700 Td
+(Sample PDF) Tj
+ET
+endstream
+endobj
+xref
+0 5
+0000000000 65535 f
+0000000009 00000 n
+0000000058 00000 n
+0000000115 00000 n
+0000000274 00000 n
+trailer
+<< /Size 5 /Root 1 0 R >>
+startxref
+362
+%%EOF