Search code examples
mysql · csv · command-line · import-from-excel · symfony-3.3

Symfony3 : How to do a massive import from a CSV file as fast as possible?


I have a .csv file with more than 690 000 rows.

I found a solution to import data that works very well but it's a little bit slow... (around 100 records every 3 seconds = 63 hours !!).

How can I improve my code to make it faster ?

I do the import via a console command.

Also, I would like to import only prescribers that aren't already in database (to save time). To complicate things, no field is really unique (except for id).

Two prescribers can have the same lastname, firstname, live in the same city and have the same RPPS and professional codes. But, it's the combination of these 6 fields which makes them unique !

That's why I check every one of these fields before creating a new record.

<?php

namespace AppBundle\Command;

use Symfony\Bundle\FrameworkBundle\Command\ContainerAwareCommand;
use Symfony\Component\Console\Input\InputInterface;
use Symfony\Component\Console\Output\OutputInterface;
use Symfony\Component\Console\Helper\ProgressBar;
use AppBundle\Entity\Prescriber;

class PrescribersImportCommand extends ContainerAwareCommand
{
    /**
     * Configures the console command name and description.
     */
    protected function configure()
    {
        $this
            // the name of the command (the part after "bin/console")
            ->setName('import:prescribers')
            ->setDescription('Import prescribers from .csv file')
        ;
    }

    /**
     * Command entry point: wraps the import with start/end timestamps
     * so the operator can see how long the run took.
     */
    protected function execute(InputInterface $input, OutputInterface $output)
    {
        // Show when the script is launched
        $startedAt = new \DateTime();
        $output->writeln('<comment>Start : ' . $startedAt->format('d-m-Y G:i:s') . ' ---</comment>');

        // Import CSV on DB via Doctrine ORM
        $this->import($input, $output);

        // Show when the script is over
        $endedAt = new \DateTime();
        $output->writeln('<comment>End : ' . $endedAt->format('d-m-Y G:i:s') . ' ---</comment>');
    }

    /**
     * Imports CSV rows as Prescriber entities, skipping rows whose
     * six-field combination (rpps, lastname, firstname, profCode,
     * postalCode, city) already exists in the database.
     *
     * Entities are flushed and detached in batches to keep the
     * UnitOfWork (and memory usage) bounded.
     */
    protected function import(InputInterface $input, OutputInterface $output)
    {
        $em = $this->getContainer()->get('doctrine')->getManager();

        // Turning off doctrine default logs queries for saving memory
        $em->getConnection()->getConfiguration()->setSQLLogger(null);

        // Get php array of data from CSV
        $data = $this->getData();

        // Start progress
        $progress = new ProgressBar($output, count($data));
        $progress->start();

        $batchSize = 100; // flush/clear frequency, in rows
        $i = 1;          // current row index (1-based so the modulo below works)

        // Fetch the repository ONCE: repeated getRepository() calls inside
        // the loop are pure overhead, and the repository instance remains
        // valid across $em->clear() calls.
        $repository = $em->getRepository('AppBundle:Prescriber');

        foreach ($data as $row) {
            // No single column is unique; only the combination of these six
            // fields identifies a prescriber, hence the multi-field lookup.
            $p = $repository->findOneBy(array(
                'rpps'       => $row['rpps'],
                'lastname'   => $row['nom'],
                'firstname'  => $row['prenom'],
                'profCode'   => $row['code_prof'],
                'postalCode' => $row['code_postal'],
                'city'       => $row['ville'],
            ));

            // findOneBy() returns the entity or null, so test against null
            // explicitly. If the prescriber does not exist we create one.
            if (null === $p) {
                $p = new Prescriber();
                $p->setRpps($row['rpps']);
                $p->setLastname($row['nom']);
                $p->setFirstname($row['prenom']);
                $p->setProfCode($row['code_prof']);
                $p->setPostalCode($row['code_postal']);
                $p->setCity($row['ville']);
                $em->persist($p);
            }

            // Flush every $batchSize rows, then detach everything so the
            // identity map does not grow without bound over 690k rows.
            if (($i % $batchSize) === 0) {
                $em->flush();
                $em->clear();

                // Advancing for progress display on console
                $progress->advance($batchSize);
                $progress->display();
            }
            $i++;
        }

        // Flush the final partial batch (rows since the last % hit)
        $em->flush();
        $em->clear();

        // Ending the progress bar process
        $progress->finish();
    }

    /**
     * Loads the CSV from the filesystem and converts it to a PHP array
     * via the app.csvtoarray_converter service.
     *
     * @return array one associative array per CSV row
     */
    protected function getData()
    {
        // Getting the CSV from filesystem
        $fileName = 'web/docs/prescripteurs.csv';

        // Using service for converting CSV to PHP Array
        $converter = $this->getContainer()->get('app.csvtoarray_converter');

        return $converter->convert($fileName);
    }
}

EDIT

According to @Jake N answer, here is the final code.

It's very much faster! 10 minutes to import 653,727 of 693,230 rows (39,503 were duplicates!)

1) Add two columns in my table : created_at and updated_at

2) Add a single composite UNIQUE index spanning every column of my table (except id and the date columns) with phpMyAdmin, so the database itself rejects duplicate combinations.

3) Add ON DUPLICATE KEY UPDATE in my query, to update just the updated_at column.

// Prepare the statement ONCE outside the loop: the SQL never changes,
// only the bound values do. Re-preparing per row wastes a round-trip
// on every one of the ~650k iterations.
$sql = "INSERT INTO prescripteurs (rpps, nom, prenom, code_prof, code_postal, ville)
    VALUES(:rpps, :nom, :prenom, :codeprof, :cp, :ville)
    ON DUPLICATE KEY UPDATE updated_at = NOW()";
$stmt = $em->getConnection()->prepare($sql);

foreach ($data as $row) {
    // Duplicate detection is delegated to the composite UNIQUE index:
    // a duplicate row only touches updated_at instead of inserting.
    $r = $stmt->execute(array(
        'rpps'      => $row['rpps'],
        'nom'       => $row['nom'],
        'prenom'    => $row['prenom'],
        'codeprof'  => $row['code_prof'],
        'cp'        => $row['code_postal'],
        'ville'     => $row['ville'],
    ));

    if (!$r) {
        // Clear the bar before printing so the message is not garbled,
        // then redraw it afterwards.
        $progress->clear();
        $output->writeln('<comment>An error occurred.</comment>');
        $progress->display();

    } elseif (($i % $batchSize) === 0) {
        $progress->advance($batchSize);
        $progress->display();
    }
    $i++;
}

// Ending the progress bar process
$progress->finish();

// Ending the progress bar process
$progress->finish();

Solution

  • 1. Don't use Doctrine

    Try not to use Doctrine if you can: it eats memory and, as you have found, is slow. Use raw SQL for the import with simple INSERT statements instead:

    // Raw INSERT via the DBAL connection, bypassing the ORM entirely —
    // no entity hydration, no UnitOfWork tracking, so far less memory.
    // NOTE(review): literal values belong in bound parameters (see the
    // prepared-statement variant below) whenever they come from input.
    $sql = <<<SQL
    INSERT INTO `category` (`label`, `code`, `is_hidden`) VALUES ('Hello', 'World', '1');
    SQL;
    $stmt = $this->getDoctrine()->getManager()->getConnection()->prepare($sql);
    $stmt->execute();
    

    Or you can prepare the statement with values:

    // Parameterised variant: values are bound at execute time, which is
    // both safe (no SQL injection) and lets the same prepared statement
    // be reused for many rows.
    $sql = <<<SQL
    INSERT INTO `category` (`label`, `code`, `is_hidden`) VALUES (:label, :code, :hidden);
    SQL;
    $stmt = $this->getDoctrine()->getManager()->getConnection()->prepare($sql);
    // Fixed: the parameter array was missing its closing bracket.
    $stmt->execute(['label' => 'Hello', 'code' => 'World', 'hidden' => 1]);
    

    Untested code, but it should get you started as this is how I have done it before.

    2. Index

    Also, for your checks, have you got an index on all those fields? So that the lookup is as quick as possible.