python django csv django-models django-mptt

Django: Creating categories using querysets/filtering

I'm trying to figure out if it's possible to create categories using custom filters.

I am building an e-commerce app and I have set up my category model using mptt. I am importing a csv that creates my top level categories which works fine. The problem is I will need to have sub-categories that are more specific e.g Men's Clothing(Top Level) > Jeans.

The csv has several fields that contain info relating to each product, e.g. description: "stone wash bootcut jeans". I would ideally like to check these fields for keywords and add each product to the correct categories. Is it possible to set up categories this way, or is there an alternative solution?

I am a django newbie so any help is appreciated.

models.py

from django.db import models
from mptt.models import MPTTModel, TreeForeignKey

class Category(MPTTModel):
    """A node in the product category tree.

    ``name`` is unique across all categories, while ``slug`` only has to be
    unique among the children of the same ``parent``.
    """

    name = models.CharField(max_length=50, unique=True)
    parent = TreeForeignKey(
        'self',
        null=True,
        blank=True,
        related_name='children',
        db_index=True,
        on_delete=models.CASCADE,
    )
    slug = models.SlugField()

    class MPTTMeta:
        order_insertion_by = ['name']

    class Meta:
        unique_together = (('parent', 'slug'),)
        verbose_name_plural = 'categories'

    def get_slug_list(self):
        """Return the cumulative slug paths leading to this category.

        The slugs of this node's ancestors (including the node itself) are
        taken in the order ``get_ancestors`` yields them, and every prefix of
        that sequence is joined with ``/``, e.g. ``['a', 'a/b', 'a/b/c']``.

        Returns:
            A list of path strings; empty if the ancestors cannot be
            determined.
        """
        try:
            ancestors = self.get_ancestors(include_self=True)
        except Exception:
            # Best effort, as before: treat a failing ancestor lookup as
            # "no ancestors". Unlike a bare ``except`` this no longer
            # swallows KeyboardInterrupt / SystemExit.
            # NOTE(review): presumably this guards against unsaved
            # instances - confirm and narrow the exception type.
            return []

        paths = []
        for node in ancestors:
            # Extend the previous path by one segment instead of re-joining
            # the whole prefix on every iteration.
            paths.append(paths[-1] + '/' + node.slug if paths else node.slug)
        return paths

    def __str__(self):
        """Return the category name."""
        return self.name

class Brands(models.Model):
    """A product brand, stored as a single free-text name."""

    brand_name = models.CharField(
        default='',
        max_length=500,
    )

    def __str__(self):
        """Use the brand name as the human-readable representation."""
        label = self.brand_name
        return label


class Product(models.Model):
    """A single product, linked to one brand and (optionally) one category."""

    aw_deep_link = models.CharField(max_length=500, default='')
    description = models.CharField(max_length=500, default='')
    product_name = models.CharField(max_length=500, default='')
    aw_image_url = models.CharField(max_length=500, default='')
    search_price = models.DecimalField(max_digits=6, decimal_places=2, null=True)
    merchant_name = models.CharField(max_length=500, default='')
    display_price = models.CharField(max_length=500, default='')
    # Brands is a plain models.Model, not an MPTT tree, so it is referenced
    # with a regular ForeignKey; TreeForeignKey is meant for MPTT models.
    # The database column is unchanged, but Django will generate a
    # state-only migration for the changed field class.
    brand_name = models.ForeignKey('Brands', on_delete=models.CASCADE)
    colour = models.CharField(max_length=500, default='')
    rrp_price = models.DecimalField(max_digits=6, decimal_places=2, null=True)
    # Category is an MPTT tree, so TreeForeignKey is the right field here.
    category = TreeForeignKey('Category', null=True, blank=True, on_delete=models.CASCADE)
    slug = models.SlugField(default='')

    def __str__(self):
        """Return the product name."""
        return self.product_name

importCSV.py

import re
from products.models import Category, Brands
from django.core.management.base import BaseCommand


class Command(BaseCommand):
    """Import products from a CSV file given with ``--file``.

    The first row is treated as a header and skipped. Every other row must
    provide at least eleven columns, read by position:

        0 aw_deep_link, 1 description, 2 product_name, 3 aw_image_url,
        4 search_price, 5 merchant_name, 6 display_price, 7 brand name,
        8 colour, 9 rrp_price, 10 category name

    The brand and category named in a row are created if they do not exist.
    """

    help = "Load some sample data into the db"

    # Highest column index read is 10, so a usable row has 11 columns.
    EXPECTED_COLUMNS = 11

    def add_arguments(self, parser):
        """Register the ``--file`` option naming the CSV file to load."""
        parser.add_argument('--file', dest='file', help='File to load')

    def handle(self, *args, **options):
        """Read the CSV file and save one ``Product`` per data row.

        Does nothing when ``--file`` was not supplied.

        Raises:
            CommandError: if a non-empty row has fewer columns than expected.
        """
        # Function-scope imports, following the original's local import of
        # Product.
        import csv

        from django.core.management.base import CommandError
        from products.models import Product

        path = options['file']
        if not path:
            return

        self.stdout.write("Importing " + path)

        added = 0
        # newline='' lets the csv module handle line endings itself, so
        # quoted fields containing line breaks are parsed correctly.
        with open(path, newline='') as f:
            # csv.reader (instead of str.split) copes with quoted fields that
            # contain commas and strips the trailing newline that previously
            # ended up inside the last column (the category name).
            reader = csv.reader(f)
            next(reader, None)  # skip the header; tolerate an empty file
            for fields in reader:
                if not fields:
                    # Blank line in the file.
                    continue
                if len(fields) < self.EXPECTED_COLUMNS:
                    raise CommandError(
                        "Line {0}: expected at least {1} columns, got {2}".format(
                            reader.line_num, self.EXPECTED_COLUMNS, len(fields)
                        )
                    )

                category, _ = Category.objects.get_or_create(name=fields[10])
                brand, _ = Brands.objects.get_or_create(brand_name=fields[7])

                Product.objects.create(
                    aw_deep_link=fields[0],
                    description=fields[1],
                    product_name=fields[2],
                    aw_image_url=fields[3],
                    # The price columns are nullable decimals; an empty cell
                    # is stored as NULL rather than failing decimal parsing.
                    search_price=fields[4] or None,
                    merchant_name=fields[5],
                    display_price=fields[6],
                    brand_name=brand,
                    colour=fields[8],
                    rrp_price=fields[9] or None,
                    category=category,
                )
                added += 1

        self.stdout.write("Added {0} products".format(added))

Solution

So you have

Manually predefined subcategories that may comprise more than one keyword
Several text fields for each Product where it is assured that any of the keywords appears at least once

From this setup I would first try to generalize the "search term" for each subcategory, maybe with a regex, depending on the complexity of the conditions you need to identify a subcategory. Most probably a list of synonyms is already sufficient. Add such a field to your Category model (here a regex solution):

class Category(MPTTModel):
    # The base class stays MPTTModel, as in the question's models.py: the
    # import loop relies on the tree method is_leaf_node().
    # Search pattern used to recognise this category in a product's text
    # fields; only needed for subcategories (top level from csv).
    regex = models.CharField(max_length=100, blank=True)
    ...

For your example where trainers and runners would be equivalent (to my knowledge of English these are plural words here, so not equivalent to trainer or runner appearing anywhere), this would be (as a regex) r'trainers|runners'

This is the part you need to define manually - I don't envy you for the tedious work involved ;)

Afterwards, your import loop would need some changes around here:

def handle(self, **options):
    from products.models import Product, Category
    # Wrapping the queryset in list() evaluates it once here, so the
    # categories are not queried again inside the import loop below.
    all_categories = list(Category.objects.all())

and here

                data = ...
                # Find the first leaf category whose regex occurs in one of
                # the product's text fields; the fields are tried in order.
                subcat = None
                for textfield in ('description', 'product_name'):
                    # I suppose these are the two relevant fields to scan?
                    for cat in all_categories:
                        # An empty regex matches every string, so categories
                        # without a search term (e.g. the top-level ones
                        # created from the csv) must be skipped explicitly.
                        # Only nodes that have no children are considered.
                        if not cat.regex or not cat.is_leaf_node():
                            continue
                        if re.search(cat.regex, data[textfield]) is not None:
                            subcat = cat
                            break
                    if subcat is not None:
                        break
                # subcat is now the first matching subcategory (or None, in
                # which case the category from the csv is kept)
                if subcat is not None:
                    data['category'] = subcat

                product = Product(**data)

Complete

import re
from products.models import Category, Brands
from django.core.management.base import BaseCommand


class Command(BaseCommand):
    """Import products from a CSV file and sort them into subcategories.

    The first row is treated as a header and skipped. Every other row must
    provide at least eleven columns, read by position:

        0 aw_deep_link, 1 description, 2 product_name, 3 aw_image_url,
        4 search_price, 5 merchant_name, 6 display_price, 7 brand name,
        8 colour, 9 rrp_price, 10 category name

    Each product starts in the category named in column 10. If the regex of
    a leaf category matches one of the scanned text fields, the product is
    assigned to that category instead.
    """

    help = "Load some sample data into the db"

    # Highest column index read is 10, so a usable row has 11 columns.
    EXPECTED_COLUMNS = 11
    # Product text fields scanned for subcategory patterns, in this order.
    SCANNED_FIELDS = ('description', 'product_name')

    def add_arguments(self, parser):
        """Register the ``--file`` option naming the CSV file to load."""
        parser.add_argument('--file', dest='file', help='File to load')

    def _find_subcategory(self, matchers, data):
        """Return the first category whose pattern matches, else ``None``.

        Args:
            matchers: list of ``(compiled_pattern, category)`` pairs.
            data: the product field dict; only ``SCANNED_FIELDS`` are read.
        """
        for textfield in self.SCANNED_FIELDS:
            for pattern, cat in matchers:
                if pattern.search(data[textfield]) is not None:
                    return cat
        return None

    def handle(self, *args, **options):
        """Read the CSV file and save one ``Product`` per data row.

        Does nothing when ``--file`` was not supplied.

        Raises:
            CommandError: if a non-empty row has fewer columns than expected.
        """
        # Function-scope imports, following the original's local import of
        # the models.
        import csv

        from django.core.management.base import CommandError
        from products.models import Product, Category

        path = options['file']
        if not path:
            return

        # Compile each pattern once, up front, instead of on every row.
        # Categories with an empty regex are left out: an empty pattern
        # matches every string and would capture all products. Only nodes
        # that have no children are considered.
        matchers = [
            (re.compile(cat.regex), cat)
            for cat in Category.objects.all()
            if cat.regex and cat.is_leaf_node()
        ]

        self.stdout.write("Importing " + path)

        added = 0
        # newline='' lets the csv module handle line endings itself, so
        # quoted fields containing line breaks are parsed correctly.
        with open(path, newline='') as f:
            # csv.reader (instead of str.split) copes with quoted fields that
            # contain commas and strips the trailing newline that previously
            # ended up inside the last column (the category name).
            reader = csv.reader(f)
            next(reader, None)  # skip the header; tolerate an empty file
            for fields in reader:
                if not fields:
                    # Blank line in the file.
                    continue
                if len(fields) < self.EXPECTED_COLUMNS:
                    raise CommandError(
                        "Line {0}: expected at least {1} columns, got {2}".format(
                            reader.line_num, self.EXPECTED_COLUMNS, len(fields)
                        )
                    )

                category, _ = Category.objects.get_or_create(name=fields[10])
                brand, _ = Brands.objects.get_or_create(brand_name=fields[7])

                data = {
                    'aw_deep_link': fields[0],
                    'description': fields[1],
                    'product_name': fields[2],
                    'aw_image_url': fields[3],
                    # The price columns are nullable decimals; an empty cell
                    # is stored as NULL rather than failing decimal parsing.
                    'search_price': fields[4] or None,
                    'merchant_name': fields[5],
                    'display_price': fields[6],
                    'brand_name': brand,
                    'colour': fields[8],
                    'rrp_price': fields[9] or None,
                    'category': category,
                }

                # Prefer a matching subcategory over the csv category.
                subcat = self._find_subcategory(matchers, data)
                if subcat is not None:
                    data['category'] = subcat

                Product.objects.create(**data)
                added += 1

        self.stdout.write("Added {0} products".format(added))