Search code examples
pythonselenium-webdrivercss-selectors

Selenium getting duplicate item when use if-else to assign value to a dataclass


I am trying to scrape products (name, price, img, url) from a website. The price can be either a currency string (e.g. 1.000.000đ) or the text "Giá Liên Hệ" ("contact for price"), so I want to check whether the price is "Giá Liên Hệ" and, if it is, store the price as 0. However, the function seems to return duplicate items.

My dataclass(probably need to change current_price to Decimal):

@dataclass
class Item:
  """One product parsed from a store's search-result card.

  Instances are built by the scraper and then copied field by field into
  ``Product.objects.update_or_create``.
  """
  # Link to the product page (the card's <a href>).
  url: str
  # Product title (the card's <h3> text).
  name: str
  # Parsed price. The scraper always passes a Decimal here and uses
  # Decimal("0") when the listing shows "Giá Liên Hệ" instead of a number.
  current_price: Decimal
  # Name of the store the card was scraped from, e.g. "Cellphones".
  place: str
  # Source URL of the product image (the card's <img src>).
  img: str
  # Moment the card was scraped.
  date_add: datetime.datetime

My func:

def cellphones(query):
  """Scrape cellphones.com.vn search results for ``query`` and save them.

  Every result card is parsed into an ``Item``, upserted into ``Product``
  (matched on ``name`` + ``place``) and collected for the caller.

  Args:
    query: Free-text search term.

  Returns:
    A list of the saved ``Product`` rows in page order, one per distinct
    product name.

  Raises:
    decimal.InvalidOperation: If a price label is neither "Giá Liên Hệ" nor
      a number once the currency sign, dots and spaces are removed.

  Selenium lookup errors raised by ``find_element`` are not caught here.
  """
  from urllib.parse import quote  # local import: only this function needs it

  lists = []
  seen_names = set()
  # Percent-encode the whole term, not only spaces, so characters such as
  # "&", "#" or "+" cannot cut the query string short or change its meaning.
  search = quote(query, safe="")
  url = f"https://cellphones.com.vn/catalogsearch/result?q={search}"
  driver.get(url)
  content = driver.find_element(By.CSS_SELECTOR, "div[id*='search-catalog-page']")
  cards = content.find_elements(By.CSS_SELECTOR, "div[class*='product-info']")
  for card in cards:
    # Only the price differs between the two kinds of listing, so resolve it
    # first and build the Item once.
    price_text = card.find_element(By.CSS_SELECTOR, "p[class*='product__price--show']").text
    if price_text == "Giá Liên Hệ":
      # "Contact for price": stored as 0, which the template renders back as
      # the original label.
      price = Decimal("0")
    else:
      price = Decimal(price_text.replace("₫", "").replace(".", "").replace(" ", ""))
    scraped = Item(
      url=card.find_element(By.CSS_SELECTOR, "a").get_attribute('href'),
      name=card.find_element(By.CSS_SELECTOR, "h3").text,
      current_price=price,
      place="Cellphones",
      img=card.find_element(By.CSS_SELECTOR, "img").get_attribute('src'),
      date_add=datetime.datetime.now()
    )
    product, _created = Product.objects.update_or_create(
        name=scraped.name,
        place=scraped.place,
        defaults={
            'current_price': scraped.current_price,
            'url': scraped.url,
            'img': scraped.img,
            'date_add': scraped.date_add
        }
    )
    # The card selector is a substring match on the class attribute, so the
    # same product can be matched more than once (presumably by nested
    # elements whose class also contains "product-info" - not verified against
    # the live page). update_or_create then hands back the same row again, so
    # keep only the first occurrence of each name.
    if product.name in seen_names:
      continue
    seen_names.add(product.name)
    lists.append(product)
  return lists

My template:

{% block content %}
{# Search results: echoes the searched term, the number of products in `lists`, and one card per product. #}

<p>You searched '<span>{{context.name}}</span>'</p>
<p>Return {{lists|length}} products.</p>
<ul class="d-flex flex-row flex-wrap align-content-center justify-content-around align-items-center list-unstyled">
    {% for p in lists %}
        <li>
            <div class="card h-100 mt-2 border-dark mb-3" style="max-width: 18rem;">
                <img src="{{p.img}}" class="card-img-top" style="height:17.813em;width:auto;" alt="product image">
                <div class="card-body">
                    <h5 class="card-title">{{p.name}}</h5>
                    {# A price of 0 is shown as the "Giá Liên Hệ" label instead of a number. #}
                    {# NOTE(review): intcomma is provided by django.contrib.humanize - confirm this template has {% load humanize %} above this block. #}
                    {% if p.current_price == 0 %}
                        <p class="card-text">Giá Liên Hệ</p>
                    {% else %}
                        <p class="card-text">{{p.current_price|intcomma}}</p>
                    {% endif %}
                    <p class="card-text">Store: {{p.place}}</p>
                    <p class="card-text">Add at {{p.date_add}}</p>
                    <a href="{{p.url}}" target="_blank" class="btn btn-primary">Go to store</a>
                </div>
            </div>
        </li>
    {% endfor %}
</ul>{% endblock %}

Solution

  • OK, I gave up trying to find a clean solution for this, so I simply check whether an item with the same name is already in the list, and only add it if it is not:

    def cellphones(query):
        """Scrape cellphones.com.vn search results for ``query`` and save them.

        Every result card is parsed into an ``Item`` and upserted into
        ``Product`` (matched on ``name`` + ``place``). A card that fails to
        parse or save is logged and skipped, so one bad listing does not
        abort the whole search.

        Args:
            query: Free-text search term.

        Returns:
            A list of the saved ``Product`` rows in page order, one per
            distinct product name.
        """
        # Local imports: only this function needs them.
        import logging
        from urllib.parse import quote

        logger = logging.getLogger(__name__)
        lists = []
        seen_names = set()
        # Percent-encode the whole term, not only spaces, so characters such
        # as "&", "#" or "+" cannot cut the query string short.
        search = quote(query, safe="")
        url = f"https://cellphones.com.vn/catalogsearch/result?q={search}"
        driver.get(url)
        content = driver.find_element(
            By.CSS_SELECTOR, "div[id*='search-catalog-page']")
        cards = content.find_elements(
            By.CSS_SELECTOR, "div[class*='product-info']")
        for card in cards:
            try:
                link = card.find_element(By.CSS_SELECTOR, "a").get_attribute('href')
                title = card.find_element(By.CSS_SELECTOR, "h3").text
                price_text = card.find_element(
                    By.CSS_SELECTOR, "p[class*='product__price--show']").text
                image = card.find_element(By.CSS_SELECTOR, "img").get_attribute('src')
                scraped_at = datetime.datetime.now(tz=timezone.utc)
                if price_text == "Giá Liên Hệ":
                    # "Contact for price": stored as 0, which the template
                    # renders back as the original label.
                    price = Decimal("0")
                else:
                    price = Decimal(
                        price_text.replace("₫", "").replace(".", "").replace(" ", ""))
                new_item = Item(
                    url=link,
                    name=title,
                    current_price=price,
                    place="Cellphones",
                    img=image,
                    date_add=scraped_at
                )
                product, _created = Product.objects.update_or_create(
                    name=new_item.name,
                    place=new_item.place,
                    defaults={
                        'current_price': new_item.current_price,
                        'url': new_item.url,
                        'img': new_item.img,
                        'date_add': new_item.date_add
                    }
                )
            except Exception:
                # Still best-effort per card, but no longer silent: the
                # traceback is logged, and KeyboardInterrupt / SystemExit are
                # allowed to propagate (a bare ``except`` swallowed them too).
                logger.exception("Skipping a search result that could not be processed")
                continue
            # The same product can be matched by the card selector more than
            # once; keep only the first row seen for each name.
            if product.name in seen_names:
                continue
            seen_names.add(product.name)
            lists.append(product)
        return lists