Here’s an update:

I stuck with the idea of using Collector; it categorizes models from a 
deletion perspective into:

   - fast_deletes - models that can be easily deleted, without checking for 
   other relations
   - field_updates - fields that have SET_NULL, i.e. relations to the 
   customer objects that are only updated
   - … all the rest

So I kept the structure from Collector, looping over the three categories 
mentioned above. It works ok; for each instance, you’ll have to check 
instance._meta.get_fields() to find the field related to the old customer 
record.
Fixing GenericRelations requires the most code, because you’ll have to find 
the content type, object id (and maybe content object) fields that are 
related to the old customer record.

Moreover, you’ll have to be aware that you may violate foreign key 
constraints when replacing the old id.

My implementation is attached, if it’s interesting for anybody.

Have a nice weekend y’all
jools
jools schrieb am Freitag, 21. Januar 2022 um 16:05:58 UTC+1:

> Dear Django users,
>
> here’s an interesting case and I’m curious whether somebody can point me 
> in the right direction.
>
> Here’s the problem:
> My company has a database of customers. Over the years, many duplicates 
> have been created. For cleaning up duplicates, I’d like to have a 
> search-and-replace functionality.
>
> I.e.: Replace all references in the database to customer <old> with 
> customer <new>.
> The database schema has quite a bit of complexity, so I’m aiming to find a 
> very generic solution.
>
> Approach:
> Django has a bit of functionality to find all references to an existing 
> object, namely django.db.models.deletion.Collector and I’m using it to 
> find all references.
>
> Though, the “replace” logic seems quite hard to get right:
>
>    - It has to keep parent links intact
>    - It has to recognize references in parent models (Customer model is 
>    derived from the concrete model Actor)
>    - It has to recognize generic relations, built with Django’s content 
>    types
>
>
> My stub implementation comes below.
>
>
>    - Has anybody else implemented something like this, e.g. in a helper library?
>    - Do you think the approach is right?
>    - What would you do differently?
>    - Any caveats that you know of?
>
> Best regards
> Jools
>
> Stub implementation:
>
> from django.db.models.deletion import Collector, ProtectedError, 
> RestrictedError
>
>
> def replace_object(old_obj, new_obj, using='default'):
>     """
>     Replace an old object with a new object throughout the database.
>     """
>     collector = Collector(using)
>
>     try:
>         collector.collect([old_obj])
>     except ProtectedError:
>                 pass
>     except RestrictedError:
>                 pass
>
>     for model, obj in collector.instances_with_model():
>         for field_opts in obj._meta.get_fields():
>             if __is_qualified(field_opts, new_obj):
>                 setattr(obj, field_opts.name, new_obj)
>                 obj.save()
>
> def __is_qualified(field_opts, new_obj):
>     if not hasattr(field_opts, 'related_model'):
>         return False
>     
>     # This check is wrong for multi-table inheritance, I suppose?
>     if field_opts.related_model != new_obj.__class__:
>         return False
>     if hasattr(field_opts, 'parent_link') and field_opts.parent_link is 
> True:
>         return False
>
>     return True
>
>

-- 
You received this message because you are subscribed to the Google Groups 
"Django users" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to django-users+unsubscr...@googlegroups.com.
To view this discussion on the web visit 
https://groups.google.com/d/msgid/django-users/1a094625-ab1c-4763-81f2-d8c77ffab870n%40googlegroups.com.
from django.contrib.contenttypes.fields import GenericRelation, GenericForeignKey
from django.contrib.contenttypes.models import ContentType
from django.db.models import sql
from django.db.models.deletion import Collector
from django.db.utils import IntegrityError


class LogMessage:
    """One line of the replacement report: an object plus an optional note.

    The object is usually a model instance, but anything works: objects
    without a ``_meta.verbose_name`` are labelled with their class name.
    """

    def __init__(self, instance, message=""):
        self._message = message
        self._instance = instance

    def __str__(self):
        """Render as ``"<label>, <str(instance)>, <message>"``."""
        subject = self._instance
        try:
            # Prefer the human-readable model name when the object has one.
            label = subject._meta.verbose_name
        except AttributeError:
            label = subject.__class__.__name__

        return "{}, {}, {}".format(label, str(subject), self._message)


class ObjectReplacer:
    """Repoint database references from one model instance to another.

    The referencing rows are discovered with Django's deletion ``Collector``
    and handled in three groups: ``collector.field_updates``,
    ``collector.fast_deletes`` and ``collector.data``.

    Successful updates and integrity errors are accumulated on the instance
    (they are never reset, so a reused instance keeps earlier entries) and
    printed at the end of :meth:`replace_object`.
    """

    def __init__(self):
        self._errors = []
        self._updates = []

    def replace_object(self, old_obj, new_obj, using='default'):
        """Make rows that reference ``old_obj`` reference ``new_obj`` instead.

        ``old_obj`` itself is neither modified nor deleted.  Exceptions raised
        by ``Collector.collect`` are not caught here.

        :param old_obj: instance whose references should be replaced.
        :param new_obj: instance the references should point to afterwards.
        :param using: database alias passed to the collector and the updates.
        """
        collector = Collector(using)
        collector.collect([old_obj], keep_parents=True)
        self.__update_fields(collector, new_obj)
        self.__update_fast_deletes(collector, old_obj, new_obj)
        self.__update_the_rest(collector, old_obj, new_obj)

        self.show_updates()
        self.show_errors()

    @staticmethod
    def _referencing_fields(model, target_model):
        """Yield the writable FK-like fields of ``model`` aimed at ``target_model``.

        Only concrete, non many-to-many fields are returned, because those are
        the ones that map to a column an UPDATE can write.  Reverse relations
        and generic foreign keys are therefore skipped.  Parent links of
        multi-table inheritance are skipped as well so that child rows stay
        attached to their own parent.
        """
        for field in model._meta.get_fields():
            if not getattr(field, 'concrete', False):
                continue
            if getattr(field, 'many_to_many', False):
                continue
            if getattr(field, 'related_model', None) != target_model:
                continue
            if getattr(getattr(field, 'remote_field', None), 'parent_link', False):
                continue
            yield field

    @staticmethod
    def _has_generic_relation(model, target_model):
        """Tell whether ``target_model`` declares a GenericRelation to ``model``.

        The relation shows up on ``model`` as a (possibly hidden) reverse
        relation whose ``field`` is the ``GenericRelation``.
        """
        return any(
            isinstance(getattr(field_opts, 'field', None), GenericRelation)
            and getattr(field_opts, 'related_model', None) == target_model
            for field_opts in model._meta.get_fields(include_hidden=True)
        )

    def __update_fields(self, collector, new_obj):
        """Repoint the rows the collector scheduled for a field update."""
        # NOTE(review): assumes collector.field_updates maps
        # model -> {(field, value): instances}; confirm this layout against
        # the installed Django version.
        for model, instances_for_fieldvalues in collector.field_updates.items():
            for (field, _value), instances in instances_for_fieldvalues.items():
                pks = [obj.pk for obj in instances]
                if self.__update_batch(model, pks, field.name, new_obj.pk, collector.using):
                    self.add_updates([LogMessage(obj) for obj in instances])

    def __update_fast_deletes(self, collector, old_obj, new_obj):
        """Repoint the rows held in ``collector.fast_deletes``.

        Each entry is treated as a queryset of rows referencing ``old_obj``
        (it must provide ``.model`` and ``.filter``).  The queryset does not
        say which column holds the reference, so every foreign key to the old
        object's model is tried, narrowed to rows that really hold the old pk.
        Generic foreign keys are only touched when the old object's model
        declares a GenericRelation to the queryset's model.
        """
        target_model = old_obj.__class__
        old_ct = None

        for qs in collector.fast_deletes:
            model = qs.model

            for field in self._referencing_fields(model, target_model):
                matching = qs.filter(**{field.attname: old_obj.pk})
                self.__repoint_queryset(matching, field.name, new_obj, collector.using)

            if not self._has_generic_relation(model, target_model):
                continue

            for gen_field_opts in model._meta.get_fields():
                if not isinstance(gen_field_opts, GenericForeignKey):
                    continue
                if old_ct is None:
                    old_ct = ContentType.objects.get_for_model(old_obj)
                matching = qs.filter(**{
                    gen_field_opts.ct_field: old_ct,
                    gen_field_opts.fk_field: old_obj.pk,
                })
                self.__repoint_queryset(matching, gen_field_opts.fk_field, new_obj, collector.using)

    def __repoint_queryset(self, queryset, field_name, new_obj, using):
        """Set ``field_name`` to ``new_obj.pk`` on every row of ``queryset``."""
        # Materialise the rows first so that they can be logged after the
        # update, when the queryset's filter would no longer match them.
        instances = list(queryset)
        if not instances:
            return

        pks = [obj.pk for obj in instances]
        if self.__update_batch(queryset.model, pks, field_name, new_obj.pk, using):
            self.add_updates([LogMessage(obj) for obj in instances])

    def __update_the_rest(self, collector, old_obj, new_obj):
        """Repoint the instances the collector gathered in ``collector.data``.

        For every instance, only foreign keys that currently hold the old
        object's pk are rewritten; a second foreign key to the same model that
        points somewhere else is left alone.  The pk is assumed to be the
        foreign keys' target column, as everywhere else in this class.
        """
        target_model = old_obj.__class__

        for model, instances in collector.data.items():
            fk_fields = list(self._referencing_fields(model, target_model))
            has_generic = self._has_generic_relation(model, target_model)
            if not fk_fields and not has_generic:
                continue

            for instance in instances:
                if instance == old_obj:
                    # Don't attempt to change the old object itself.
                    continue

                for field_opts in fk_fields:
                    if getattr(instance, field_opts.attname) == old_obj.pk:
                        self.__simple_update(model, instance, collector, new_obj, field_opts)

                if has_generic:
                    self.__update_generic_relation(instance, collector, old_obj, new_obj)

    def __simple_update(self, model, instance, collector, new_obj, field_opts):
        """Point one foreign key of one instance at ``new_obj``."""
        if self.__update_batch(model, [instance.pk], field_opts.name, new_obj.pk, collector.using):
            self.add_update(LogMessage(instance))

    def __update_generic_relation(self, instance, collector, old_obj, new_obj):
        """Rewrite the object id of the instance's generic foreign key.

        The first GenericForeignKey whose content type equals the old
        object's content type is updated; any further ones are ignored.
        """
        # If it's a generic relation, we search the fields for a GenericForeignKey field.
        # This will give us information about the desired content type and object id.
        desired_ct = ContentType.objects.get_for_model(old_obj)

        for gen_field_opts in instance._meta.get_fields():
            if not isinstance(gen_field_opts, GenericForeignKey):
                continue

            ct_for_this_field = getattr(instance, gen_field_opts.ct_field)
            if ct_for_this_field != desired_ct:
                continue

            if self.__update_batch(instance.__class__, [instance.pk], gen_field_opts.fk_field,
                                   new_obj.pk, collector.using):
                self.add_update(LogMessage(instance))
            return

    def __update_batch(self, model, pks, field_name, new_pk, using):
        """Run ``UPDATE model SET field_name = new_pk`` for the given pks.

        :return: ``True`` on success; ``False`` if an ``IntegrityError`` was
            raised, in which case the error is recorded and nothing of this
            batch is kept.
        """
        # Imported locally so the block does not depend on an extra
        # module-level import.
        from django.db import transaction

        query = sql.UpdateQuery(model)
        try:
            # The atomic block acts as a savepoint: when the database rejects
            # the update, only this batch is rolled back and an enclosing
            # transaction remains usable for the following updates.
            with transaction.atomic(using=using):
                query.update_batch(pks, {field_name: new_pk}, using)
        except IntegrityError as error:
            # Name the model, field and rows so the report says what failed.
            context = f'{model.__name__}.{field_name} for pk(s) {list(pks)}'
            self.add_errors([LogMessage(None, f'{context}: {error}')])
            return False

        return True

    def add_error(self, log_message):
        """Record a single error entry."""
        self._errors.append(log_message)

    def add_errors(self, log_messages):
        """Record several error entries at once."""
        self._errors += log_messages

    def add_update(self, log_message):
        """Record a single successful update."""
        self._updates.append(log_message)

    def add_updates(self, log_messages):
        """Record several successful updates at once."""
        self._updates += log_messages

    def show_updates(self):
        """Print every recorded update, one per line."""
        print("Updated:")
        for log_msg in self._updates:
            print("\t", str(log_msg))

    def show_errors(self):
        """Print every recorded error, one per line."""
        print("Errors:")
        for log_msg in self._errors:
            print("\t", str(log_msg))

Reply via email to