Files
chatwoot/enterprise/app/services/page_crawler_service.rb
Sojan Jose 480f34803b feat: Response Bot using GPT and Webpage Sources (#7518)
This commit introduces the ability to associate response sources to an inbox, allowing external webpages to be parsed by Chatwoot. The parsed data is converted into embeddings for use with GPT models when managing customer queries.

The implementation relies on the `pgvector` extension for PostgreSQL. Database migrations related to this feature are handled separately by `Features::ResponseBotService`. A future update will integrate these migrations into the default rails migrations, once compatibility with Postgres extensions across all self-hosted installation options is confirmed.

Additionally, a new GitHub action has been added to the CI pipeline to ensure the execution of specs related to this feature.
2023-07-21 18:11:51 +03:00

39 lines
834 B
Ruby

# Fetches a single page (or sitemap) over HTTP and exposes its title, its
# body converted to Markdown, and the set of links it references.
#
# The response body is always parsed with Nokogiri's HTML parser, including
# when the link points at an XML sitemap.
class PageCrawlerService
  attr_reader :external_link

  # Downloads and parses the document immediately; no error handling is done
  # here, so anything raised by the HTTP request propagates to the caller.
  #
  # @param external_link [String] URL of the page or sitemap to crawl
  def initialize(external_link)
    @external_link = external_link
    @doc = Nokogiri::HTML(HTTParty.get(external_link).body)
  end

  # @return [Set<String>] URLs listed in the sitemap when the link ends with
  #   ".xml", otherwise the absolute URLs of the anchors found in the page
  def page_links
    sitemap? ? extract_links_from_sitemap : extract_links_from_html
  end

  # @return [String, nil] stripped text of the first <title> element, or nil
  #   when the document has none
  def page_title
    title_element = @doc.at_xpath('//title')
    title_element&.text&.strip
  end

  # @return the result of ReverseMarkdown.convert for the document's <body>
  def body_text_content
    ReverseMarkdown.convert @doc.at_xpath('//body'), unknown_tags: :bypass, github_flavored: true
  end

  private

  # A link is treated as a sitemap purely based on its ".xml" suffix.
  def sitemap?
    @external_link.end_with?('.xml')
  end

  # Collects the text of every <loc> element. Pretty-printed sitemaps wrap the
  # URL in whitespace/newlines, so entries are stripped and blank ones dropped.
  def extract_links_from_sitemap
    @doc.xpath('//loc').map { |loc| loc.text.strip }.reject(&:empty?).to_set
  end

  # Resolves every anchor href against the crawled link. Hrefs that cannot be
  # parsed as URIs are skipped so a single bad link does not abort the crawl.
  def extract_links_from_html
    @doc.xpath('//a/@href').map { |link| absolute_url(link.value) }.compact.to_set
  end

  # @param href [String] raw href attribute value
  # @return [String, nil] href resolved against external_link, or nil when
  #   the href is not a valid URI
  def absolute_url(href)
    URI.join(@external_link, href).to_s
  rescue URI::InvalidURIError
    nil
  end
end