Deleting a replica can leave a replica update vector (RUV) on the other servers. This can confuse things if the replica is re-added, and it also causes the server to calculate changes against a server that may no longer exist.

389-ds-base provides a new task that propagates itself to all available replicas to clean this RUV data.

This patch will create this task at deletion time to hopefully clean things up.

It isn't perfect. If any replica is down or unavailable at the time the cleanruv task fires, and then comes back up, the old RUV data may be re-propagated around.

To make things easier in this case I've added two new commands to ipa-replica-manage. The first, list_ruv, lists the replication ids of all the servers we have a RUV for. Using this you can call the second, clean_ruv, with the replication id of a server that no longer exists to try the cleanallruv step again.

This is quite dangerous though. If you run cleanruv against a replica id that does exist it can cause a loss of data. I believe I've put in enough scary warnings about this.

rob
>From e5a5b19b64e1b81ce560cc5b1edd540b9920a928 Mon Sep 17 00:00:00 2001
From: Rob Crittenden <rcrit...@redhat.com>
Date: Wed, 27 Jun 2012 14:51:45 -0400
Subject: [PATCH] Run the CLEANALLRUV task when deleting a replication
 agreement.

https://fedorahosted.org/freeipa/ticket/2303
---
 install/tools/ipa-replica-manage       |   85 +++++++++++++++++++++++++++++++-
 install/tools/man/ipa-replica-manage.1 |   17 +++++++
 ipaserver/install/replication.py       |   28 +++++++++++
 3 files changed, 129 insertions(+), 1 deletion(-)

diff --git a/install/tools/ipa-replica-manage b/install/tools/ipa-replica-manage
index e2378173821457ed05dae2722223d148266ef822..a72b04a2e1676f0a8008e3181025e53e241d522c 100755
--- a/install/tools/ipa-replica-manage
+++ b/install/tools/ipa-replica-manage
@@ -22,6 +22,7 @@ import os
 
 import ldap, re, krbV
 import traceback
+from urllib2 import urlparse
 
 from ipapython import ipautil
 from ipaserver.install import replication, dsinstance, installutils
@@ -37,6 +38,7 @@ CACERT = "/etc/ipa/ca.crt"
 # dict of command name and tuples of min/max num of args needed
 commands = {
     "list":(0, 1, "[master fqdn]", ""),
+    "list_ruv":(0, 0, "", ""),
     "connect":(1, 2, "<master fqdn> [other master fqdn]",
                     "must provide the name of the servers to connect"),
     "disconnect":(1, 2, "<master fqdn> [other master fqdn]",
@@ -44,7 +46,8 @@ commands = {
     "del":(1, 1, "<master fqdn>",
                     "must provide hostname of master to delete"),
     "re-initialize":(0, 0, "", ""),
-    "force-sync":(0, 0, "", "")
+    "force-sync":(0, 0, "", ""),
+    "clean_ruv":(1, 1, "Replica ID of to clean", ""),
 }
 
 def parse_options():
@@ -229,6 +232,7 @@ def del_link(realm, replica1, replica2, dirman_passwd, force=False):
     if repl2 and type1 == replication.IPA_REPLICA:
         failed = False
         try:
+            repl2.make_readonly()
             repl2.delete_agreement(replica1)
             repl2.delete_referral(replica1)
         except ldap.LDAPError, e:
@@ -251,6 +255,7 @@ def del_link(realm, replica1, replica2, dirman_passwd, force=False):
 
     repl1.delete_agreement(replica2)
     repl1.delete_referral(replica2)
+    repl1.cleanallruv(repl2._get_replica_id(repl2.conn, None))
 
     if type1 == replication.WINSYNC:
         try:
@@ -268,6 +273,80 @@ def del_link(realm, replica1, replica2, dirman_passwd, force=False):
 
     print "Deleted replication agreement from '%s' to '%s'" % (replica1, replica2)
 
+def get_ruv(realm, host, dirman_passwd):
+    """
+    Return the RUV entries as a list of (host:port, rid) string tuples
+    """
+    try:
+        thisrepl = replication.ReplicationManager(realm, host, dirman_passwd)
+    except Exception, e:
+        print "Failed to connect to server %s: %s" % (host, str(e))
+        sys.exit(1)
+
+    search_filter = '(&(nsuniqueid=ffffffff-ffffffff-ffffffff-ffffffff)(objectclass=nstombstone))'
+    try:
+        entries = thisrepl.conn.search_s(api.env.basedn, ldap.SCOPE_ONELEVEL,
+            search_filter, ['nsds50ruv'])
+    except ldap.NO_SUCH_OBJECT:
+        print "No RUV records found."
+        sys.exit(0)
+
+    servers = []
+    for ruv in entries[0].getValues('nsds50ruv'):
+        if ruv.startswith('{replicageneration'):
+            continue
+        data = re.match('\{replica (\d+) (ldap://.*:\d+)\}\s+\w+\s+\w*', ruv)
+        if data:
+            rid = data.group(1)
+            (scheme, netloc, path, params, query, fragment) = urlparse.urlparse(data.group(2))
+            servers.append((netloc, rid))
+        else:
+            print "unable to decode: %s" % ruv
+
+    return servers
+
+def list_ruv(realm, host, dirman_passwd, verbose):
+    """
+    List the Replica Update Vectors on this host to get the available
+    replica IDs.
+    """
+    servers = get_ruv(realm, host, dirman_passwd)
+    for (netloc, rid) in servers:
+        print "%s: %s" % (netloc, rid)
+
+def clean_ruv(realm, ruv, options):
+    """
+    Given an RID create a CLEANALLRUV task to clean it up.
+    """
+    try:
+        ruv = int(ruv)
+    except ValueError:
+        sys.exit("Replica ID must be an integer: %s" % ruv)
+
+    servers = get_ruv(realm, options.host, options.dirman_passwd)
+    found = False
+    for (netloc, rid) in servers:
+        if ruv == int(rid):
+           found = True
+           hostname = netloc
+           break
+
+    if not found:
+        sys.exit("Replica ID %s not found" % ruv)
+
+    print "Clean the Replication Update Vector for %s" % hostname
+    print
+    print "Cleaning the wrong replica ID will cause that server to no"
+    print "longer replicate so it may miss updates while the process"
+    print "is running. It would need to be re-initialized to maintain"
+    print "consistency. Be very careful."
+    if not ipautil.user_input("Continue to clean?", False):
+        sys.exit("Aborted")
+    thisrepl = replication.ReplicationManager(realm, options.host,
+                                              options.dirman_passwd)
+    thisrepl.cleanallruv(ruv)
+    print "Cleanup task created"
+
 def del_master(realm, hostname, options):
 
     force_del = False
@@ -503,6 +582,8 @@ def main():
         if len(args) == 2:
             replica = args[1]
         list_replicas(realm, host, replica, dirman_passwd, options.verbose)
+    elif args[0] == "list_ruv":
+        list_ruv(realm, host, dirman_passwd, options.verbose)
     elif args[0] == "del":
         del_master(realm, args[1], options)
     elif args[0] == "re-initialize":
@@ -531,6 +612,8 @@ def main():
             replica1 = host
             replica2 = args[1]
         del_link(realm, replica1, replica2, dirman_passwd)
+    elif args[0] == "clean_ruv":
+        clean_ruv(realm, args[1], options)
 
 try:
     main()
diff --git a/install/tools/man/ipa-replica-manage.1 b/install/tools/man/ipa-replica-manage.1
index 98103ffdd416f11c44e147e6b4eb84c682da39e0..3973814ca37e847eada4a245a00e8b729f7a85db 100644
--- a/install/tools/man/ipa-replica-manage.1
+++ b/install/tools/man/ipa-replica-manage.1
@@ -47,6 +47,18 @@ The connect and disconnect options are used to manage the replication topology.
 The disconnect option cannot be used to remove the last link of a replica. To remove a replica from the topology use the del option.
 .TP
 If a replica is deleted and then re\-added within a short time\-frame then the 389\-ds instance on the master that created it should be restarted before re\-installing the replica. The master will have the old service principals cached which will cause replication to fail.
+.TP
+\fBlist_ruv\fR
+\- List the replication IDs on this server.
+.TP
+Each IPA master server has a unique replication ID. This ID is used by 389\-ds\-base when storing information about replication status. The output consists of the masters and their respective replication ID. See \fBclean_ruv\fR
+.TP
+\fBclean_ruv\fR [REPLICATION_ID]
+\- Run the CLEANALLRUV task to remove a replication ID.
+.TP
+When a master is removed, all other masters need to remove its replication ID from the list of masters. Normally this occurs automatically when a master is deleted with ipa\-replica\-manage. If one or more masters were down or unreachable when ipa\-replica\-manage was executed then this replica ID may still exist. The clean_ruv command may be used to clean up an unused replication ID.
+.TP
+\fBNOTE\fR: This command is \fBVERY DANGEROUS\fR. Execution against the wrong replication ID can result in inconsistent data on that master. The master should be re\-initialized from another if this happens.
 .SH "OPTIONS"
 .TP
 \fB\-H\fR \fIHOST\fR, \fB\-\-host\fR=\fIHOST\fR
@@ -112,6 +124,11 @@ Completely remove a replica:
  # ipa\-replica\-manage del srv4.example.com
 .TP
 Using connect/disconnect you can manage the replication topology.
+.TP
+List the replication IDs in use:
+ # ipa\-replica\-manage list_ruv
+ srv1.example.com:389: 7
+ srv2.example.com:389: 4
 .SH "WINSYNC"
 Creating a Windows AD Synchronization agreement is similar to creating an IPA replication agreement, there are just a couple of extra steps.
 
diff --git a/ipaserver/install/replication.py b/ipaserver/install/replication.py
index 417b7a0c5ee29615d2479842efc6862e39a7c3df..49cfb9f6f0f960861a5e4660fe679e0c96638aa4 100644
--- a/ipaserver/install/replication.py
+++ b/ipaserver/install/replication.py
@@ -1072,3 +1072,31 @@ class ReplicationManager(object):
 
         if err:
             raise err   #pylint: disable=E0702
+
+    def make_readonly(self):
+        """
+        Set nsslapd-readonly to 'on' for the userRoot database on this connection.
+        """
+        dn = 'cn=userRoot,cn=ldbm database,cn=plugins,cn=config'
+        mod = [(ldap.MOD_REPLACE, 'nsslapd-readonly', 'on')]
+        try:
+            self.conn.modify_s(dn, mod)
+        except ldap.INSUFFICIENT_ACCESS:
+            # We can't make the server we're removing read-only but
+            # this isn't a show-stopper
+            root_logger.debug("No permission to switch replica to read-only, continuing anyway")
+            pass
+
+    def cleanallruv(self, replicaId):
+        """
+        Create a CLEANALLRUV task for the given replica id. The task runs
+        in the background; its completion is not yet monitored (see FIXME).
+        """
+        root_logger.debug("Creating CLEANALLRUV task for replica id %d" % replicaId)
+        dn = self.replica_dn()
+        mod = [(ldap.MOD_REPLACE, 'nsds5task', 'CLEANALLRUV%d' % replicaId)]
+        self.conn.modify_s(dn, mod)
+
+        print "Background task created to clean replication data"
+
+        # FIXME: monitor. https://fedorahosted.org/389/ticket/398
-- 
1.7.10.4

_______________________________________________
Freeipa-devel mailing list
Freeipa-devel@redhat.com
https://www.redhat.com/mailman/listinfo/freeipa-devel

Reply via email to