Dne 30.4.2013 03:28, Tim Groeneveld napsal:
Hi Guys,

I am wondering about mail deduplication. I am looking into the possibility of separating out all of the message bodies with multiple parts inside mail
that is received from `dovecot` and hashing them all.

The idea is that by hashing all of the parts inside the email, I will be
able to ensure that each part of the email will only be saved once.

This means that attachments & common parts of the body will only be
saved once inside the storage.

How achievable would this be with the current state of dovecot? Would it
even be worth doing?

Thanks,
Tim

Hi Tim,

Thank you for your question; I am glad that I can help. I had the same problem in the past and there was no solution, so I wrote a script which computes MD5 hashes from the received date and the message body. The script then compares the MD5 hashes and deletes duplicate messages. It uses doveadm for message manipulation and openssl for computing the MD5 hashes. Deduplication can be run over all of a user’s mailboxes. The syntax is dedup <user> <mailbox>, for example:

dedup n...@domain.cz INBOX.

If you want to dedup all mailboxes, enter -A instead of the mailbox name:

dedup n...@domain.cz -A.

The script is attached. I made it for my own use, so it isn’t foolproof. If I may give you some advice: work with care and make a backup ;-)

Good luck

#!/bin/bash

# Remove duplicate messages from a mailbox.
# (Requires bash: the script uses arrays and arithmetic loops, which a
# plain POSIX /bin/sh such as dash does not support.)

#######################################
# Expunge duplicate messages from one mailbox of one user.
#
# Two messages count as duplicates when the MD5 digest of their
# "date.received body" fetch output is identical. Within each group of
# duplicates the message with the lowest UID is kept and the others are
# expunged.
#
# Arguments:
#   $1 - user name passed to doveadm -u
#   $2 - mailbox name
# Outputs:
#   Progress information on stdout, warnings on stderr.
#######################################
function dedup_mailbox ()
{
    local user=$1 mailbox=$2
    local -a uids=() entries=()
    local uid digest previous=""
    local total index=0 position=0 expunged=0

    # One UID per line; read (with default IFS) trims surrounding blanks.
    while read -r uid; do
        if [[ -n $uid ]]; then
            uids+=("$uid")
        fi
    done < <(doveadm -f flow fetch -u "$user" "uid" mailbox "$mailbox" all | cut -f 2 -d =)

    total=${#uids[@]}
    if (( total == 0 )); then
        echo "   No messages"
        return
    elif (( total == 1 )); then
        echo "   Only one message"
        return
    fi

    for uid in "${uids[@]}"; do
        # pipefail is enabled only inside the substitution's subshell, so a
        # failed fetch is detected instead of hashing empty input (which
        # would make every unreadable message look like a duplicate).
        if ! digest=$(set -o pipefail
                doveadm -f flow fetch -u "$user" "date.received body" mailbox "$mailbox" uid "$uid" | openssl md5); then
            printf 'dedup: cannot hash uid %s in %s, skipping\n' "$uid" "$mailbox" >&2
            index=$((index + 1))
            continue
        fi
        # openssl prints either "<hex>" or "(stdin)= <hex>" / "MD5(stdin)= <hex>"
        # depending on its version; keep only the digest itself.
        digest=${digest##* }
        if [[ -n $digest ]]; then
            entries+=("$digest,$uid")
            printf '   Compute hashes: %s/%s(%s)\r' "$index" "$total" "$digest,$uid"
        fi
        index=$((index + 1))
    done

    # Blank out the progress line.
    printf '%-70s\r' ''

    # Fewer than two hashed messages: nothing can be a duplicate.
    if (( ${#entries[@]} < 2 )); then
        echo ""
        return
    fi

    # Sort by digest, then numerically by UID, so equal digests are adjacent
    # and the lowest UID of each group comes first. The list is read on fd 3
    # so that doveadm cannot consume it through stdin.
    while IFS=, read -r -u 3 digest uid; do
        position=$((position + 1))
        if [[ $digest == "$previous" ]]; then
            if doveadm expunge -u "$user" mailbox "$mailbox" uid "$uid"; then
                expunged=$((expunged + 1))
            else
                printf 'dedup: cannot expunge uid %s in %s\n' "$uid" "$mailbox" >&2
            fi
        else
            previous=$digest
        fi
        printf '   Expunged %s message(s) from %s/%s\r' "$expunged" "$position" "${#entries[@]}"
    done 3< <(printf '%s\n' "${entries[@]}" | LC_ALL=C sort -t , -k 1,1 -k 2,2n)
    echo ""
}

# Entry point: dedup <user> <mailbox>, or dedup <user> -A for all mailboxes.
if [ $# -lt 2 ]; then
    echo "Usage: ${0##*/} <user> <mailbox|-A>" >&2
    exit 2
fi

boxes=()
if [ "$2" == "-A" ]; then
    # Read one mailbox name per line. The names are data and must not be
    # passed through eval, which would execute anything embedded in them.
    while IFS= read -r box; do
        if [ -n "$box" ]; then
            boxes+=("$box")
        fi
    done < <(doveadm mailbox list -u "$1")
else
    boxes[0]=$2
fi

for box in "${boxes[@]}"; do
    echo "$box:"
    dedup_mailbox "$1" "$box"
done
#!/bin/bash

# Remove duplicate messages from a mailbox.
# (Requires bash: the script uses arrays and arithmetic loops, which a
# plain POSIX /bin/sh such as dash does not support.)

#######################################
# Expunge duplicate messages from one mailbox of one user.
#
# Two messages count as duplicates when the MD5 digest of their
# "date.received body" fetch output is identical. Within each group of
# duplicates the message with the lowest UID is kept and the others are
# expunged.
#
# Arguments:
#   $1 - user name passed to doveadm -u
#   $2 - mailbox name
# Outputs:
#   Progress information on stdout, warnings on stderr.
#######################################
function dedup_mailbox ()
{
    local user=$1 mailbox=$2
    local -a uids=() entries=()
    local uid digest previous=""
    local total index=0 position=0 expunged=0

    # One UID per line; read (with default IFS) trims surrounding blanks.
    while read -r uid; do
        if [[ -n $uid ]]; then
            uids+=("$uid")
        fi
    done < <(doveadm -f flow fetch -u "$user" "uid" mailbox "$mailbox" all | cut -f 2 -d =)

    total=${#uids[@]}
    if (( total == 0 )); then
        echo "   No messages"
        return
    elif (( total == 1 )); then
        echo "   Only one message"
        return
    fi

    for uid in "${uids[@]}"; do
        # pipefail is enabled only inside the substitution's subshell, so a
        # failed fetch is detected instead of hashing empty input (which
        # would make every unreadable message look like a duplicate).
        if ! digest=$(set -o pipefail
                doveadm -f flow fetch -u "$user" "date.received body" mailbox "$mailbox" uid "$uid" | openssl md5); then
            printf 'dedup: cannot hash uid %s in %s, skipping\n' "$uid" "$mailbox" >&2
            index=$((index + 1))
            continue
        fi
        # openssl prints either "<hex>" or "(stdin)= <hex>" / "MD5(stdin)= <hex>"
        # depending on its version; keep only the digest itself.
        digest=${digest##* }
        if [[ -n $digest ]]; then
            entries+=("$digest,$uid")
            printf '   Compute hashes: %s/%s(%s)\r' "$index" "$total" "$digest,$uid"
        fi
        index=$((index + 1))
    done

    # Blank out the progress line.
    printf '%-70s\r' ''

    # Fewer than two hashed messages: nothing can be a duplicate.
    if (( ${#entries[@]} < 2 )); then
        echo ""
        return
    fi

    # Sort by digest, then numerically by UID, so equal digests are adjacent
    # and the lowest UID of each group comes first. The list is read on fd 3
    # so that doveadm cannot consume it through stdin.
    while IFS=, read -r -u 3 digest uid; do
        position=$((position + 1))
        if [[ $digest == "$previous" ]]; then
            if doveadm expunge -u "$user" mailbox "$mailbox" uid "$uid"; then
                expunged=$((expunged + 1))
            else
                printf 'dedup: cannot expunge uid %s in %s\n' "$uid" "$mailbox" >&2
            fi
        else
            previous=$digest
        fi
        printf '   Expunged %s message(s) from %s/%s\r' "$expunged" "$position" "${#entries[@]}"
    done 3< <(printf '%s\n' "${entries[@]}" | LC_ALL=C sort -t , -k 1,1 -k 2,2n)
    echo ""
}

# Entry point: dedup <user> <mailbox>, or dedup <user> -A for all mailboxes.
if [ $# -lt 2 ]; then
    echo "Usage: ${0##*/} <user> <mailbox|-A>" >&2
    exit 2
fi

boxes=()
if [ "$2" == "-A" ]; then
    # Read one mailbox name per line. The names are data and must not be
    # passed through eval, which would execute anything embedded in them.
    while IFS= read -r box; do
        if [ -n "$box" ]; then
            boxes+=("$box")
        fi
    done < <(doveadm mailbox list -u "$1")
else
    boxes[0]=$2
fi

for box in "${boxes[@]}"; do
    echo "$box:"
    dedup_mailbox "$1" "$box"
done

Reply via email to