From 5944595be35b76fbdd5d06d741231239b3a06243 Mon Sep 17 00:00:00 2001
From: andryyy <andre.peters@debinux.de>
Date: Tue, 14 Apr 2020 12:48:57 +0200
Subject: [PATCH] [Watchdog] Watch replication, if any (unsupported)

---
 data/Dockerfiles/dovecot/Dockerfile           |  1 +
 data/Dockerfiles/dovecot/docker-entrypoint.sh | 10 +++--
 data/Dockerfiles/dovecot/repl_health.sh       | 26 ++++++++++++
 data/Dockerfiles/watchdog/watchdog.sh         | 42 +++++++++++++++++++
 docker-compose.yml                            |  5 ++-
 5 files changed, 79 insertions(+), 5 deletions(-)
 create mode 100755 data/Dockerfiles/dovecot/repl_health.sh

diff --git a/data/Dockerfiles/dovecot/Dockerfile b/data/Dockerfiles/dovecot/Dockerfile
index 4c95f929..117da71e 100644
--- a/data/Dockerfiles/dovecot/Dockerfile
+++ b/data/Dockerfiles/dovecot/Dockerfile
@@ -117,6 +117,7 @@ COPY supervisord.conf /etc/supervisor/supervisord.conf
 COPY stop-supervisor.sh /usr/local/sbin/stop-supervisor.sh
 COPY quarantine_notify.py /usr/local/bin/quarantine_notify.py
 COPY quota_notify.py /usr/local/bin/quota_notify.py
+COPY repl_health.sh /usr/local/bin/repl_health.sh
 
 ENTRYPOINT ["/docker-entrypoint.sh"]
 CMD exec /usr/bin/supervisord -c /etc/supervisor/supervisord.conf
diff --git a/data/Dockerfiles/dovecot/docker-entrypoint.sh b/data/Dockerfiles/dovecot/docker-entrypoint.sh
index 265a28cd..fa633447 100755
--- a/data/Dockerfiles/dovecot/docker-entrypoint.sh
+++ b/data/Dockerfiles/dovecot/docker-entrypoint.sh
@@ -285,7 +285,8 @@ chmod +x /usr/lib/dovecot/sieve/rspamd-pipe-ham \
   /usr/local/bin/clean_q_aged.sh \
   /usr/local/bin/maildir_gc.sh \
   /usr/local/sbin/stop-supervisor.sh \
-  /usr/local/bin/quota_notify.py
+  /usr/local/bin/quota_notify.py \
+  /usr/local/bin/repl_health.sh
 
 if [[ "${MASTER}" =~ ^([yY][eE][sS]|[yY])+$ ]]; then
 # Setup cronjobs
@@ -297,14 +298,17 @@ echo '30 1 * * *   root  /usr/local/bin/sa-rules.sh  >> /dev/console 2>&1' > /et
 echo '0 2 * * *    root  /usr/bin/curl http://solr:8983/solr/dovecot-fts/update?optimize=true >> /dev/console 2>&1' > /etc/cron.d/solr-optimize
 echo '*/20 * * * * vmail /usr/local/bin/quarantine_notify.py >> /dev/console 2>&1' > /etc/cron.d/quarantine_notify
 echo '15 4 * * * vmail /usr/local/bin/clean_q_aged.sh >> /dev/console 2>&1' > /etc/cron.d/clean_q_aged
-# Fix more than 1 hardlink issue
-touch /etc/crontab /etc/cron.*/*
+echo '*/5 * * * *  vmail /usr/local/bin/repl_health.sh >> /dev/console 2>&1' > /etc/cron.d/repl_health
 else
 echo '25 * * * *   vmail /usr/local/bin/maildir_gc.sh >> /dev/console 2>&1' > /etc/cron.d/maildir_gc
 echo '30 1 * * *   root  /usr/local/bin/sa-rules.sh  >> /dev/console 2>&1' > /etc/cron.d/sa-rules
 echo '0 2 * * *    root  /usr/bin/curl http://solr:8983/solr/dovecot-fts/update?optimize=true >> /dev/console 2>&1' > /etc/cron.d/solr-optimize
+echo '*/5 * * * *  vmail /usr/local/bin/repl_health.sh >> /dev/console 2>&1' > /etc/cron.d/repl_health
 fi
 
+# Fix more than 1 hardlink issue
+touch /etc/crontab /etc/cron.*/*
+
 # Clean old PID if any
 [[ -f /var/run/dovecot/master.pid ]] && rm /var/run/dovecot/master.pid
 
diff --git a/data/Dockerfiles/dovecot/repl_health.sh b/data/Dockerfiles/dovecot/repl_health.sh
new file mode 100755
index 00000000..be17dc1f
--- /dev/null
+++ b/data/Dockerfiles/dovecot/repl_health.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+# Publishes Dovecot replication health to the Redis key DOVECOT_REPL_HEALTH (1 = healthy).
+# Do not attempt to write to slave: use the Redis master when REDIS_SLAVEOF_IP is set.
+if [[ -n "${REDIS_SLAVEOF_IP}" ]]; then
+  REDIS_CMDLINE="redis-cli -h ${REDIS_SLAVEOF_IP} -p ${REDIS_SLAVEOF_PORT}"
+else
+  REDIS_CMDLINE="redis-cli -h redis -p 6379"
+fi
+
+# Is replication active? Without a mail_replica setting there is nothing to check: report healthy.
+# grep on file is less expensive than doveconf
+if ! grep -qi mail_replica /etc/dovecot/dovecot.conf; then
+  ${REDIS_CMDLINE} SET DOVECOT_REPL_HEALTH 1 > /dev/null
+  exit
+fi
+
+FAILED_SYNCS=$(doveadm replicator status | grep "Waiting 'failed' requests" | grep -oE '[0-9]+')
+
+# 0 failed jobs is healthy; 1 failed job for mailcow.local is expected and healthy as well.
+# Any other value (including an empty result) is stored as-is, i.e. as something other than 1.
+if [[ "${FAILED_SYNCS}" != 0 && "${FAILED_SYNCS}" != 1 ]]; then
+  printf "Dovecot replicator has %d failed jobs\n" "${FAILED_SYNCS}"
+  ${REDIS_CMDLINE} SET DOVECOT_REPL_HEALTH "${FAILED_SYNCS}" > /dev/null
+else
+  ${REDIS_CMDLINE} SET DOVECOT_REPL_HEALTH 1 > /dev/null
+fi
diff --git a/data/Dockerfiles/watchdog/watchdog.sh b/data/Dockerfiles/watchdog/watchdog.sh
index 428fd34d..7cf7530b 100755
--- a/data/Dockerfiles/watchdog/watchdog.sh
+++ b/data/Dockerfiles/watchdog/watchdog.sh
@@ -438,6 +438,33 @@ dovecot_checks() {
   return 1
 }
 
+dovecot_repl_checks() {
+  # Polls Redis key DOVECOT_REPL_HEALTH; returns 1 once err_count reaches DOVECOT_REPL_THRESHOLD.
+  err_count=0
+  diff_c=0
+  THRESHOLD=${DOVECOT_REPL_THRESHOLD}
+  # Reduce error count by 2 after restarting an unhealthy container (single quotes: expand err_count when USR1 fires, not now)
+  trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
+  while [ ${err_count} -lt ${THRESHOLD} ]; do
+    err_c_cur=${err_count}
+    D_REPL_STATUS=$(redis-cli --raw -h redis GET DOVECOT_REPL_HEALTH)
+    if [[ "${D_REPL_STATUS}" != "1" ]]; then
+      err_count=$(( ${err_count} + 1 ))
+    fi
+    [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
+    [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
+    progress "Dovecot replication" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
+    if [[ $? == 10 ]]; then
+      diff_c=0
+      sleep 1
+    else
+      diff_c=0
+      sleep $(( ( RANDOM % 60 ) + 20 ))
+    fi
+  done
+  return 1
+}
+
 phpfpm_checks() {
   err_count=0
   diff_c=0
@@ -807,6 +834,18 @@ PID=$!
 echo "Spawned dovecot_checks with PID ${PID}"
 BACKGROUND_TASKS+=(${PID})
 
+( # Background subshell: rerun the replication check forever and report each failure on the pipe
+while true; do
+  if ! dovecot_repl_checks; then
+    log_msg "Dovecot replication hit error limit"
+    echo dovecot_repl_checks > /tmp/com_pipe
+  fi
+done
+) &
+PID=$!
+echo "Spawned dovecot_repl_checks with PID ${PID}"
+BACKGROUND_TASKS+=("${PID}")
+
 (
 while true; do
   if ! rspamd_checks; then
@@ -925,6 +964,9 @@ while true; do
   elif [[ ${com_pipe_answer} == "mysql_repl_checks" ]]; then
     log_msg "MySQL replication is not working properly"
     [[ ! -z ${WATCHDOG_NOTIFY_EMAIL} ]] && mail_error "${com_pipe_answer}"
+  elif [[ ${com_pipe_answer} == "dovecot_repl_checks" ]]; then
+    log_msg "Dovecot replication is not working properly" "Please check doveadm replicator status"
+    [[ ! -z ${WATCHDOG_NOTIFY_EMAIL} ]] && mail_error "${com_pipe_answer}"
   elif [[ ${com_pipe_answer} == "acme-mailcow" ]]; then
     log_msg "acme-mailcow did not complete successfully"
     [[ ! -z ${WATCHDOG_NOTIFY_EMAIL} ]] && mail_error "${com_pipe_answer}" "Please check acme-mailcow for further information."
diff --git a/docker-compose.yml b/docker-compose.yml
index f81f5232..9239895e 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -189,7 +189,7 @@ services:
             - sogo
 
     dovecot-mailcow:
-      image: mailcow/dovecot:1.117
+      image: mailcow/dovecot:1.118
       depends_on:
         - mysql-mailcow
       dns:
@@ -394,7 +394,7 @@ services:
         - /lib/modules:/lib/modules:ro
 
     watchdog-mailcow:
-      image: mailcow/watchdog:1.73
+      image: mailcow/watchdog:1.74
       # Debug
       #command: /watchdog.sh
       dns:
@@ -436,6 +436,7 @@ services:
         - POSTFIX_THRESHOLD=8
         - CLAMD_THRESHOLD=15
         - DOVECOT_THRESHOLD=12
+        - DOVECOT_REPL_THRESHOLD=2
         - PHPFPM_THRESHOLD=5
         - RATELIMIT_THRESHOLD=1
         - FAIL2BAN_THRESHOLD=1