I am not sure I would call that script simple but thanks for sharing it. :-)

On Sat, Nov 2, 2019, 16:27 Reindl Harald <h.rei...@thelounge.net> wrote:

>
>
> Am 02.11.19 um 20:39 schrieb Kevin A. McGrail:
> > Yes, I think this will cause problems.  I recommend you pick only about
> > 90 days' worth of your corpora and use that. I don't think either
> > sa-learn or txrep handling was ever considered for relearning from the
> > corpora, so it's a feature request.
>
> what?
>
> you simply rebuild bayes from the corpora in a temporary directory and
> finally move it to the live location; done, this has worked for many years
>
> [root@mail-gw:~]$ cat /var/lib/spamass-milter/training/learn.sh
> #!/usr/bin/bash
>
> # Home directory and name of the milter user
> SA_MILTER_HOME="/var/lib/spamass-milter"
> SA_MILTER_USER="sa-milt"
>
> # Make sure the script runs as the milter user (and in particular not as
> # 'root'). Otherwise report the problem on stderr and stop with a failure
> # status so that a caller can detect the refusal.
> if [[ "$(whoami)" != "$SA_MILTER_USER" ]]; then
>  /bin/echo "Das Script 'learn.sh' muss als Benutzer '$SA_MILTER_USER' aufgerufen werden" >&2
>  exit 1
> fi
>
> # Snapshot the current database status, with the volatile atime / expiry /
> # version lines filtered out, and remember the checksum of that snapshot.
> # The original note says this is meant for skipping rsync at the end.
> # NOTE(review): OLD_HASH is not read again in the visible part of the script.
> /usr/bin/sa-learn --dump magic \
>  | grep -v \
>   -e 'oldest atime' \
>   -e 'newest atime' \
>   -e 'last journal sync atime' \
>   -e 'last expiry atime' \
>   -e 'last expire atime delta' \
>   -e 'last expire reduction count' \
>   -e 'bayes db version' > /tmp/bayes_status.txt
> OLD_HASH=$(sha512sum /tmp/bayes_status.txt)
>
> # Change into the home directory so that 'find' does not complain
> cd "$SA_MILTER_HOME"
>
> # Dispatch on the first parameter: empty, 'rebuild' or a non-negative day
> # count. Anything else only sets SHOW_HELP so the usage text is printed
> # further down. The day count is validated with a regular expression rather
> # than by evaluating the raw parameter as an arithmetic expression.
> SHOW_HELP="0"
> if [[ "$1" == "rebuild" || -z "$1" || "$1" =~ ^(0|[1-9][0-9]*)$ ]]; then
>  # Full rebuild requested (build in a temp folder, move when finished)
>  if [[ "$1" == "rebuild" ]]; then
>   # Make sure the temp folder in which the new database is built exists;
>   # an "already exists" error is deliberately silenced.
>   BAYES_TEMP="$SA_MILTER_HOME/training/.temp"
>   mkdir "$BAYES_TEMP" > /dev/null 2>&1
>   # Reset Bayes
>   /usr/bin/sa-learn --dbpath "$BAYES_TEMP/bayes" --clear
>   # Spam training
>   MY_TIME=$(/usr/bin/date "+%d-%m-%Y %H:%M:%S")
>   echo "$MY_TIME: Verarbeite SPAM Samples"
>   nice -n 19 /usr/bin/sa-learn --dbpath "$BAYES_TEMP/bayes" --max-size=0 \
>    --no-sync --progress --spam "$SA_MILTER_HOME/training/spam/"
>   echo ""
>   # Ham training
>   MY_TIME=$(/usr/bin/date "+%d-%m-%Y %H:%M:%S")
>   echo "$MY_TIME: Verarbeite HAM Samples"
>   nice -n 19 /usr/bin/sa-learn --dbpath "$BAYES_TEMP/bayes" --max-size=0 \
>    --no-sync --progress --ham "$SA_MILTER_HOME/training/ham/"
>   MY_TIME=$(/usr/bin/date "+%d-%m-%Y %H:%M:%S")
>   echo "$MY_TIME: Synchronisiere Journal"
>   nice -n 19 /usr/bin/sa-learn --dbpath "$BAYES_TEMP/bayes" --sync
>   echo ""
>   # After the rebuild remove 'bayes_seen'; per the original author's note
>   # the following dump is what initialises it empty again.
>   rm -f "$BAYES_TEMP/bayes_seen"
>   /usr/bin/sa-learn --dbpath "$BAYES_TEMP/bayes" --dump magic > /dev/null 2>&1
>   # Move the freshly generated database into the live folder
>   mv -f "$BAYES_TEMP/bayes_seen" "$BAYES_TEMP/bayes_toks" \
>    "$SA_MILTER_HOME/.spamassassin/"
>   rm -f "$SA_MILTER_HOME/.spamassassin/bayes_journal"
>   # Rebuild Bogofilter as well
>   MY_TIME=$(/usr/bin/date "+%d-%m-%Y %H:%M:%S")
>   echo "$MY_TIME: Bogofilter: Verarbeite SPAM Samples"
>   /usr/bin/bogofilter --bogofilter-dir="$BAYES_TEMP" -s -B \
>    "$SA_MILTER_HOME/training/spam/"
>   # Bogofilter ham training
>   MY_TIME=$(/usr/bin/date "+%d-%m-%Y %H:%M:%S")
>   echo "$MY_TIME: Bogofilter: Verarbeite HAM Samples"
>   /usr/bin/bogofilter --bogofilter-dir="$BAYES_TEMP" -n -B \
>    "$SA_MILTER_HOME/training/ham/"
>   # Move the new Bogofilter database
>   mv "$BAYES_TEMP/wordlist.db" "$SA_MILTER_HOME/.spamassassin/"
>   # Bogofilter training finished
>   MY_TIME=$(/usr/bin/date "+%d-%m-%Y %H:%M:%S")
>   echo "$MY_TIME: Bogofilter Rebuild abgeschlossen"
>   echo ""
>   # Defragment the Bogofilter database
>   dash /usr/local/bin/workers/bf_compact.sh
>   # Save the new database to persistent storage
>   rm -f "$SA_MILTER_HOME/.spamassassin/bayes_journal"
>   /usr/bin/rsync --quiet --recursive --times --sparse \
>    --exclude=bayes_journal "$SA_MILTER_HOME/.spamassassin/" \
>    /var/lib/bayes-persistent/
>   sync
>   fstrim -a 2> /dev/null
>  # Only feed new samples directly into the live database
>  else
>   # Default to the current day, otherwise use the parameter
>   TRAIN_DAYS="${1:-1}"
>   # Spam training; file names are passed NUL-delimited so that unusual
>   # characters in sample names cannot split an argument.
>   # NOTE(review): 'nice' only applies to 'find' here, not to the sa-learn
>   # started by xargs - confirm whether that is intended.
>   MY_TIME=$(/usr/bin/date "+%d-%m-%Y %H:%M:%S")
>   echo "$MY_TIME: Verarbeite SPAM Samples"
>   nice -n 19 /usr/bin/find "$SA_MILTER_HOME/training/spam/" -type f \
>    -mtime "-$TRAIN_DAYS" -print0 \
>    | xargs -0 -r /usr/bin/sa-learn --max-size=0 --no-sync --spam
>   nice -n 19 /usr/bin/sa-learn --sync
>   echo ""
>   # Ham training
>   MY_TIME=$(/usr/bin/date "+%d-%m-%Y %H:%M:%S")
>   echo "$MY_TIME: Verarbeite HAM Samples"
>   nice -n 19 /usr/bin/find "$SA_MILTER_HOME/training/ham/" -type f \
>    -mtime "-$TRAIN_DAYS" -print0 \
>    | xargs -0 -r /usr/bin/sa-learn --max-size=0 --no-sync --ham
>   nice -n 19 /usr/bin/sa-learn --sync
>   rm -f "$SA_MILTER_HOME/.spamassassin/bayes_journal"
>   echo ""
>  fi
> else
>  SHOW_HELP="1"
> fi
>
> # Print the help text, either on explicit request (--help / -h) or because
> # the parameter was not recognised earlier (SHOW_HELP is "1"). An explicit
> # request exits with status 0, an unrecognised parameter with status 2 so
> # that a caller can tell a usage error from a successful run.
> if [[ "$1" == "--help" || "$1" == "-h" || "$SHOW_HELP" == "1" ]]; then
>  echo "Bayes-Maintaining-Skript"
>  echo "Usage:"
>  echo "  rebuild: Bayes komplett zuruecksetzen und anhand der Samples neu aufbauen"
>  echo "  <days>:  Alter der zu trainierenden Samples in Tagen (Default: 1)"
>  if [[ "$1" == "--help" || "$1" == "-h" ]]; then
>   exit 0
>  fi
>  exit 2
> fi
>
> # Training finished
> MY_TIME=$(/usr/bin/date "+%d-%m-%Y %H:%M:%S")
> echo "$MY_TIME: Done"
> echo ""
>
> # Print the Bayes statistics: take the 'dump magic' output, drop the
> # atime / expiry / version lines and shorten the remaining lines to the
> # counters with SPAM / HAM / TOKEN labels. Everything is streamed through
> # one pipeline, so no temporary file is needed. The sed expressions run in
> # the listed order on every line; the dot in the last one is escaped so
> # that only a literal "0.000" is shortened and digits inside a counter
> # value are left alone.
> /usr/bin/sa-learn --dump magic \
>  | grep -v \
>   -e 'oldest atime' \
>   -e 'newest atime' \
>   -e 'last journal sync atime' \
>   -e 'last expiry atime' \
>   -e 'last expire atime delta' \
>   -e 'last expire reduction count' \
>   -e 'bayes db version' \
>  | sed \
>   -e 's/non-token data: //g' \
>   -e 's/          0  nspam/    SPAM/g' \
>   -e 's/          0  nham/    HAM/g' \
>   -e 's/          0  ntokens/    TOKEN/g' \
>   -e 's/          0//g' \
>   -e 's/0\.000/0/g'
> echo ""
>
>
> > On 11/2/2019 1:23 PM, Dean C wrote:
> >> I'm wondering if there's a way to pick up the date from the topmost
> >> Received: header and use that.
> >>
> >> Hmm, come to think of it, bayes_seen has the same issue, it has a
> >> lastupdate field.  bayes_token has an atime field, not sure about that.
> >>
> >> Is this going to cause issues?
>

Reply via email to