Search code examples
pythondjangoscrapymanytomanyfield

How to handle manytomany field with Scrapy


I want to use Scrapy with Django.

My goal is to link the actors field to the names field, but I don't know how to deal with a Django ManyToManyField. My database is MySQL (I'm not using DjangoItem).

models.py

class Movies(models.Model):
    """Movie record whose descriptive attributes are all stored as text columns."""

    # Identifier of the movie; also used as its string representation.
    content_ID = models.CharField(max_length=30)
    # Stored as free text, not as a DateField.
    release_date = models.CharField(max_length=30)
    running_time = models.CharField(max_length=10)
    # Plain text column; this model declares no relation to Actors.
    actors = models.CharField(max_length=300)
    series = models.CharField(max_length=30)
    director = models.CharField(max_length=30)
    label = models.CharField(max_length=30)
    # The three image columns are nullable.
    # NOTE(review): presumably filled by Scrapy's images pipeline — confirm.
    image_urls = models.CharField(max_length=200, null=True)
    images = models.TextField(null=True)
    image_paths = models.TextField(null=True)

    def __str__(self):
        """Return the movie's content ID."""
        return self.content_ID

class Actors(models.Model):
    """Actor record, related to movies through the ``movielist`` many-to-many field."""

    # Nullable, so an actor row may exist without a name.
    names = models.CharField(max_length=100, null=True)
    # NOTE(review): EnMovielist is not defined in the visible code; confirm it
    # is the intended target model for this relation.
    movielist = models.ManyToManyField(EnMovielist)
    image_urls = models.CharField(max_length=200)
    images = models.TextField(null=True)
    image_paths = models.TextField(null=True)

    def __str__(self):
        """Return the actor's name, or an empty string when the name is NULL."""
        # The field is ``names``; the former ``self.name`` raised AttributeError.
        # ``__str__`` must return a str, hence the fallback for a NULL name.
        return self.names or ""

Solution

  • I use https://github.com/DevProfi/scrapy-djangoitem; to handle many-to-many fields with Scrapy I use an item pipeline:

    class ItemPersistencePipeline(object):
        """Scrapy item pipeline that stores each item as a Django model instance.

        New rows get their many-to-many fields populated from the item;
        existing rows are updated through ``update_model``.
        """

        def process_item(self, item, spider, partial=True):
            """Persist ``item`` and return it for the next pipeline stage.

            Args:
                item: the scraped item; must expose ``_model_fields_m2m`` and
                    ``get`` when a new row is created.
                spider: the running spider; ``spider.unique_fields`` is passed
                    to ``get_or_create`` and ``update_model``.
                partial: forwarded to ``update_model`` for existing rows.

            Returns:
                The unchanged ``item``, including when conversion or the
                update fails.
            """
            try:
                item_model = item_to_model(item)
            except TypeError:
                # The item cannot be converted to a model: pass it through.
                return item
            model, created = get_or_create(item_model, spider.unique_fields)

            if not created:
                # The model object was not created, so it already exists and
                # has to be updated.
                try:
                    update_model(destination=model, source=item_model, item=item, fields=spider.unique_fields, partial=partial)
                except Exception:
                    # A pipeline must hand an item (not an exception object)
                    # to the next stage, so log the failure and carry on.
                    spider.logger.exception("Failed to update %r from scraped item", model)
                return item

            # The model object was created: populate its m2m relations, if the
            # item carries any values for them.
            # TODO add bulk insert model fields
            for f in sorted(item._model_fields_m2m):
                val = item.get(f)
                if val:
                    getattr(model, f).set(val)
            return item
    
    
    def update_model(destination, source, item, fields, partial, commit=False):
        """Copy values from ``source`` onto ``destination`` and save when needed.

        Args:
            destination: the existing model instance to update.
            source: model instance holding the new values.
            item: the scraped item; supplies ``_model_fields_m2m`` and the
                values for the many-to-many fields.
            fields: not used by this function; kept so existing callers that
                pass it keep working.
            partial: whether partial update is enabled. When true, only truthy
                values that differ from the current ones are copied; when
                false, every value is copied and the object is always saved.
            commit: tracks whether the object changed, so that it is not saved
                to the database needlessly. Pass ``True`` to force a save.

        Returns:
            ``destination``, after the update.
        """
        pk = destination.pk
        m2m_names = [f.name for f in sorted(source._meta.many_to_many)]
        source_fields = fields_for_model(source, exclude=m2m_names)

        for key in source_fields:
            val_old = getattr(destination, key)
            try:
                val_new = getattr(source, key)
            except ObjectDoesNotExist:
                # The source has no related object for this field: skip it.
                continue
            if partial:
                if val_new and val_new != val_old:
                    setattr(destination, key, val_new)
                    commit = True
            else:
                commit = True
                setattr(destination, key, val_new)
        # NOTE(review): this only re-assigns a falsy pk to itself; it looks
        # like it was meant to restore the original pk — confirm the intent.
        if not pk:
            setattr(destination, 'pk', pk)

        if commit:
            destination.save()

        for f in sorted(item._model_fields_m2m):
            val_new = item.get(f)
            if not val_new:
                continue
            manager = getattr(destination, f)
            # The item may carry a single related object or a collection of
            # them; ``add`` takes objects as positional arguments, so a
            # collection has to be unpacked rather than passed as one value.
            if isinstance(val_new, (str, bytes)) or not hasattr(val_new, '__iter__'):
                candidates = [val_new]
            else:
                candidates = list(val_new)
            existing = list(manager.all())
            missing = [obj for obj in candidates if obj not in existing]
            if missing:
                manager.add(*missing)

        return destination