diff --git a/core/admin/mailu/configuration.py b/core/admin/mailu/configuration.py index bb7080c9..d324bf8d 100644 --- a/core/admin/mailu/configuration.py +++ b/core/admin/mailu/configuration.py @@ -75,6 +75,8 @@ DEFAULT_CONFIG = { 'API': False, 'WEB_API': '/api', 'API_TOKEN': None, + 'FULL_TEXT_SEARCH': 'en', + 'FULL_TEXT_SEARCH_ATTACHMENTS': False, 'LOG_LEVEL': 'INFO', 'SESSION_KEY_BITS': 128, 'SESSION_TIMEOUT': 3600, diff --git a/core/admin/run_dev.sh b/core/admin/run_dev.sh index 0f7c6e05..3d1fc771 100755 --- a/core/admin/run_dev.sh +++ b/core/admin/run_dev.sh @@ -78,6 +78,7 @@ ENV \ \ ADMIN_ADDRESS="127.0.0.1" \ FRONT_ADDRESS="127.0.0.1" \ + FTS_ATTACHMENTS_ADDRESS="127.0.0.1" \ SMTP_ADDRESS="127.0.0.1" \ IMAP_ADDRESS="127.0.0.1" \ REDIS_ADDRESS="127.0.0.1" \ diff --git a/core/base/Dockerfile b/core/base/Dockerfile index e1087488..2f9c1142 100644 --- a/core/base/Dockerfile +++ b/core/base/Dockerfile @@ -81,6 +81,7 @@ ENV \ PATH="/app/venv/bin:${PATH}" \ ADMIN_ADDRESS="admin" \ FRONT_ADDRESS="front" \ + FTS_ATTACHMENTS_ADDRESS="tika" \ SMTP_ADDRESS="smtp" \ IMAP_ADDRESS="imap" \ OLETOOLS_ADDRESS="oletools" \ diff --git a/core/dovecot/Dockerfile b/core/dovecot/Dockerfile index bb7507cd..2681bcff 100644 --- a/core/dovecot/Dockerfile +++ b/core/dovecot/Dockerfile @@ -7,8 +7,8 @@ ARG VERSION LABEL version=$VERSION RUN set -euxo pipefail \ - ; apk add --no-cache --repository=http://dl-cdn.alpinelinux.org/alpine/edge/main 'dovecot<2.4' dovecot-lmtpd dovecot-pigeonhole-plugin dovecot-pop3d dovecot-submissiond poppler-utils \ - ; apk add --no-cache --repository=http://dl-cdn.alpinelinux.org/alpine/edge/testing dovecot-fts-flatcurve catdoc \ + ; apk add --no-cache --repository=http://dl-cdn.alpinelinux.org/alpine/edge/main 'dovecot<2.4' dovecot-lmtpd dovecot-pigeonhole-plugin dovecot-pop3d dovecot-submissiond \ + ; apk add --no-cache --repository=http://dl-cdn.alpinelinux.org/alpine/edge/testing dovecot-fts-flatcurve \ ; apk add --no-cache rspamd-client \ ; mkdir 
/var/lib/dovecot diff --git a/core/dovecot/conf/dovecot.conf b/core/dovecot/conf/dovecot.conf index 6f25e99a..c5173787 100644 --- a/core/dovecot/conf/dovecot.conf +++ b/core/dovecot/conf/dovecot.conf @@ -63,11 +63,14 @@ plugin { fts_autoindex = yes fts_enforced = yes fts_autoindex_exclude = \Trash + fts_autoindex_exclude1 = \Junk fts_filters = normalizer-icu stopwords fts_filters_en = lowercase english-possessive stopwords fts_filters_fr = lowercase contractions stopwords - - fts_decoder = decode2text + fts_header_excludes = Received DKIM-* ARC-* X-* x-* Comments Delivered-To Return-Path Authentication-Results Message-ID References In-Reply-To Thread-* Accept-Language Content-* MIME-Version + {% if FULL_TEXT_SEARCH_ATTACHMENTS %} + fts_tika = http://{{ FTS_ATTACHMENTS_ADDRESS }}:9998/tika/ + {% endif %} {% endif %} {% if COMPRESSION in [ 'gz', 'bz2', 'lz4', 'zstd' ] %} @@ -79,16 +82,6 @@ plugin { {% endif %} } -{% if FULL_TEXT_SEARCH %} -service decode2text { - executable = script /usr/libexec/dovecot/decode2text.sh - user = nobody - unix_listener decode2text { - mode = 0666 - } -} -{% endif %} - ############### # Authentication ############### diff --git a/setup/flavors/compose/docker-compose.yml b/setup/flavors/compose/docker-compose.yml index b266fec0..a81f9f44 100644 --- a/setup/flavors/compose/docker-compose.yml +++ b/setup/flavors/compose/docker-compose.yml @@ -98,8 +98,16 @@ services: volumes: - "{{ root }}/mail:/mail" - "{{ root }}/overrides/dovecot:/overrides:ro" + networks: + - default + {% if tika_enabled %} + - fts_attachments + {% endif %} depends_on: - front + {% if tika_enabled %} + - fts_attachments + {% endif %} {% if resolver_enabled %} - resolver dns: @@ -140,6 +148,21 @@ services: {% endif %} {% endif %} +{% if tika_enabled %} + fts_attachments: + image: apache/tika:2.9.0.0-full + hostname: tika + restart: always + networks: + - fts_attachments + depends_on: + {% if resolver_enabled %} + - resolver + dns: + - {{ dns }} + {% endif %} +{% endif %} 
+ antispam: image: ${DOCKER_ORG:-ghcr.io/mailu}/${DOCKER_PREFIX:-}rspamd:${MAILU_VERSION:-{{ version }}} hostname: antispam @@ -257,3 +280,8 @@ networks: driver: bridge internal: true {% endif %} +{% if tika_enabled %} + fts_attachments: + driver: bridge + internal: true +{% endif %} diff --git a/setup/flavors/compose/mailu.env b/setup/flavors/compose/mailu.env index ef95d8f7..9380bab6 100644 --- a/setup/flavors/compose/mailu.env +++ b/setup/flavors/compose/mailu.env @@ -113,7 +113,7 @@ COMPRESSION_LEVEL={{ compression_level }} # IMAP full-text search is enabled by default. # Set the following variable to off in order to disable the feature # or a comma separated list of language codes to support -# FULL_TEXT_SEARCH=off +FULL_TEXT_SEARCH=en ################################### # Web settings @@ -188,3 +188,5 @@ DEFAULT_SPAM_THRESHOLD=80 # This is a mandatory setting for using the RESTful API. API_TOKEN={{ api_token }} +# Whether tika should be enabled (scan/OCR email attachments) +FULL_TEXT_SEARCH_ATTACHMENTS={{ tika_enabled }} diff --git a/setup/templates/steps/compose/02_services.html b/setup/templates/steps/compose/02_services.html index 2311e4a3..afa5e726 100644 --- a/setup/templates/steps/compose/02_services.html +++ b/setup/templates/steps/compose/02_services.html @@ -64,6 +64,15 @@ the security implications caused by such an increase of attack surface.

Oletools scans documents in email attachements for malicious macros. It has a much lower memory footprint than a full-fledged anti-virus. +

+ + + Tika scans documents in email attachments, processes them (OCR, keyword extraction) and then indexes them so that they can be searched efficiently. This requires significant resources (RAM, CPU and storage). +
+ diff --git a/towncrier/newsfragments/2971.bugfix b/towncrier/newsfragments/2971.bugfix index fd981e75..55d775bb 100644 --- a/towncrier/newsfragments/2971.bugfix +++ b/towncrier/newsfragments/2971.bugfix @@ -1,5 +1,5 @@ - Switch from fts-xapian to fts-flatcurve. This should address the problem with indexes getting too big and will be the default in dovecot 2.4 -- Enable full-text search of email attachments +- Enable full-text search of email attachments if configured (via Tika: you'll need to re-run setup) If you would like more than english to be supported, please ensure you update your FULL_TEXT_SEARCH configuration variable.