Search code examples
phplaraveloci8

Why SELECT table with 200.000 records is using too much memory (+2GB)?


I'm selecting data from a VIEW in Oracle that in real size is around 50MB (61 columns and 263.000 rows). I only have one column with data length of 4000 and all others up to 100.

When I selecting using Laravel (split into packages of 10.000 records) it is occupying around 2.5GB in memory.

I have made some search and tried disable log queries using DB::disableQueryLog, $connection->disableQueryLog(), included gc_collect_cycles call after each SELECT and unset the results variable - without any effect.


<?php

use Yajra\Oci8\Connectors\OracleConnector;
use Yajra\Oci8\Oci8Connection;

/**
 * @return Oci8Connection
 * @throws \Exception
 */
private function getOracleConnection()
{
    $config = [
        'driver' => 'oracle',
        'host' => 'host',
        'database' => 'database',
        'port' => 'port',
        'username' => 'user',
        'password' => 'password',
        'charset' => 'charset',
        'schema' => 'schema',
        'options' => [
            \PDO::ATTR_PERSISTENT => true
        ],
    ];
    $connector = new OracleConnector();
    $connection = $connector->connect($config);

    $db = new Oci8Connection($connection, $database);

    return $db;
}

protected function loadSourceSystemData(): Collection
{
    $connection = $this->getOracleConnection();

    DB::disableQueryLog();
    $connection->disableQueryLog();

    $package_size = 10000;

    $return = new Collection();
    $offset = 0;

    while(true) {
        $records = $connection->query()
            ->from('ZVPCP003')
            ->take($package_size)
            ->offset($offset)
            ->get();

        if(empty($records)) break;

        $return = $return->merge($records);
        unset($records);

        gc_collect_cycles();
        $offset += $package_size;
    }

    return $return;
}

My expectation is using less then 1GB at least, that is very high yet but is acceptable.

Update: I have measured the real memory use:

Rows: 10000 | Mem: 124 MB  
Rows: 20000 | Mem: 241 MB  
Rows: 30000 | Mem: 357 MB  
Rows: 40000 | Mem: 474 MB  
Rows: 50000 | Mem: 590 MB  
Rows: 60000 | Mem: 707 MB  
Rows: 70000 | Mem: 825 MB  
Rows: 80000 | Mem: 941 MB  
Rows: 90000 | Mem: 1058 MB  
Rows: 100000 | Mem: 1174 MB  
Rows: 110000 | Mem: 1290 MB  
Rows: 120000 | Mem: 1407 MB  
Rows: 130000 | Mem: 1523 MB  
Rows: 140000 | Mem: 1644 MB  
Rows: 150000 | Mem: 1760 MB  
Rows: 160000 | Mem: 1876 MB  
Rows: 170000 | Mem: 1993 MB  
Rows: 180000 | Mem: 2109 MB  
Rows: 190000 | Mem: 2226 MB  
Rows: 200000 | Mem: 2342 MB  
Rows: 210000 | Mem: 2459 MB  
Rows: 220000 | Mem: 2575 MB  
Rows: 230000 | Mem: 2691 MB  
Rows: 240000 | Mem: 2808 MB  
Rows: 250000 | Mem: 2924 MB  
Rows: 260000 | Mem: 3041 MB  
Rows: 263152 | Mem: 3087 MB  

Solution

  • Maybe by converting loadSourceSystemData to generator and then process data in chunks, this way you will have at most 10000 rows loaded at time, since they are not collected to one big collection they can be automatically freed.

    <?php
    
    function loadSourceSystemData(): iterable
    {
        $connection = $this->getOracleConnection();
        DB::disableQueryLog();
        $connection->disableQueryLog();
        $package_size = 10000;
        $offset = 0;
        do {
            $records = $connection->query()->from('ZVPCP003')
                    ->take($package_size)
                    ->offset($offset)
                    ->get();
            $offset += $package_size;
            yield collect($records);
        } while(!empty($records));
    }
    
    foreach($this->loadSourceSystemData() as $collection) {
        foreach($collection as $row) {
            // Process row here
        }
    }
    

    Update

    I've tried to load data from CSV file to check overhead and when using arrays it takes about 70% more memory then using objects.

    For 500000 rows like "P80,A142900,2012,6,35" array took about 213MB while array of objects 136 MB.

    class Item {
        public $a;
        public $b;
        public $c;
        public $d;
        public $e;
    }
    
    if (($handle = fopen("data.csv", "r")) !== FALSE) {
        $row = 0;
        $list = [];
        while (($data = fgetcsv($handle, 1000, ",")) !== FALSE) {
            $item = new Item();
            $item->a = $data[0];
            $item->b = $data[1];
            $item->c = $data[2];
            $item->d = $data[3];
            $item->e = $data[4];
            $list[] = $item;
            //$list[] = $data;
        }
        fclose($handle);
        $m = memory_get_usage(true) / 1000 / 1000;
        echo "Rows ", count($list), " Memory ", $m, "MB \n";
    }
    

    Update 2

    More memory can be saved if data is converted to specific types eg. int and if some columns have lot's of repeating value then caching can be used.

    class Name {
        public $name;
        public function __construct($name)
        {
            $this->name = $name;
        }
    }
    
    class NameCache {
        private $cache = [];
        public function getName(string $name) {
    
            if (isset($this->cache[$name])) {
                return $this->cache[$name];
            }
            $item = new Name($name);
            $this->cache[$name] = $item;
            return $item;
        }
    }
    
    $nameCache = new NameCache();
    $item = new Item();
    $item->a = $nameCache->getName($data[0]);
    $item->b = $nameCache->getName($data[1]);
    $item->c = (int)$data[2];
    $item->d = (int)$data[3];
    $item->e = (int)$data[4];
    

    With this memory was reduced from 136MB to 75MB.