Search code examples
sqlmysqlquery-optimizationentity-attribute-value

MySQL multiple EXISTS statements causes query to hang


I am implementing an entity-attribute-value model to store nationwide property data that will be updated daily: 130 million properties, will probably store billions of values. We also want to search this data on arbitrary attributes, hence using EAV so we can index the values for each attribute and search on them (instead of one massive table with a slew of indexes). I'm using ruby and rails, so queries are built with rails' various macros.

Here is the schema:

CREATE TABLE `properties` (
  `id` bigint NOT NULL AUTO_INCREMENT,
  `aid` int NOT NULL,
  `deleted_at` datetime DEFAULT NULL,
  PRIMARY KEY (`id`),
  KEY `index_properties_on_deleted_at_and_aid` (`deleted_at`,`aid`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb3;

CREATE TABLE `property_attributes` (
  `id` bigint NOT NULL AUTO_INCREMENT,
  `name` varchar(100) DEFAULT NULL,
  `value_type` varchar(255) DEFAULT NULL,
  PRIMARY KEY (`id`),
  KEY `index_property_attributes_on_name` (`name`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb3;

CREATE TABLE `property_value_booleans` (
  `id` bigint NOT NULL AUTO_INCREMENT,
  `aid` int NOT NULL,
  `attribute_id` bigint NOT NULL,
  `value` tinyint(1) DEFAULT NULL,
  `deleted_at` datetime DEFAULT NULL,
  PRIMARY KEY (`id`),
  KEY `index_pvb_on_daav` (`deleted_at`,`aid`,`attribute_id`,`value`),
  KEY `index_pvb_on_dav` (`deleted_at`,`attribute_id`,`value`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb3;

CREATE TABLE `property_value_dates` (
  `id` bigint NOT NULL AUTO_INCREMENT,
  `aid` int NOT NULL,
  `attribute_id` bigint NOT NULL,
  `value` date DEFAULT NULL,
  `deleted_at` datetime DEFAULT NULL,
  PRIMARY KEY (`id`),
  KEY `index_pvd_on_daav` (`deleted_at`,`aid`,`attribute_id`,`value`),
  KEY `index_pvd_on_dav` (`deleted_at`,`attribute_id`,`value`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb3;

CREATE TABLE `property_value_numerics` (
  `id` bigint NOT NULL AUTO_INCREMENT,
  `aid` int NOT NULL,
  `attribute_id` bigint NOT NULL,
  `value` decimal(18,6) DEFAULT NULL,
  `deleted_at` datetime DEFAULT NULL,
  PRIMARY KEY (`id`),
  KEY `index_pvn_on_daav` (`deleted_at`,`aid`,`attribute_id`,`value`),
  KEY `index_pvn_on_dav` (`deleted_at`,`attribute_id`,`value`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb3;

CREATE TABLE `property_value_strings` (
  `id` bigint NOT NULL AUTO_INCREMENT,
  `aid` int NOT NULL,
  `attribute_id` bigint NOT NULL,
  `value` varchar(255) DEFAULT NULL,
  `created_at` datetime DEFAULT NULL,
  `deleted_at` datetime DEFAULT NULL,
  PRIMARY KEY (`id`),
  KEY `index_pvs_on_daav` (`deleted_at`,`aid`,`attribute_id`,`value`),
  KEY `index_pvs_on_dav` (`deleted_at`,`attribute_id`,`value`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb3;

The following is the query which is relatively simple in nature (conceptually), but running this will periodically cause mysql to hang in the "statistics" state for what seems like an indefinite amount of time:

SELECT COUNT(*) FROM `properties` WHERE `properties`.`deleted_at` IS NULL AND 
(exists (SELECT 1 FROM `property_value_strings` as t_74653 WHERE t_74653.`deleted_at` IS NULL AND t_74653.`attribute_id` = 48 AND t_74653.`value` = 'NC' AND (t_74653.`aid` = `properties`.`aid`))) AND 
(exists (SELECT 1 FROM `property_value_strings` as t_a9e53 WHERE t_a9e53.`deleted_at` IS NULL AND t_a9e53.`attribute_id` = 14 AND t_a9e53.`value` = 'Wake' AND (t_a9e53.`aid` = `properties`.`aid`))) AND 
(exists (SELECT 1 FROM `property_value_strings` as t_a225b WHERE t_a225b.`deleted_at` IS NULL AND t_a225b.`attribute_id` = 163 AND t_a225b.`value` IN ('181', '366', '369', '378', '385', '386', '388') AND (t_a225b.`aid` = `properties`.`aid`))) AND 
(exists (SELECT 1 FROM `property_value_booleans` as t_2b45b WHERE t_2b45b.`deleted_at` IS NULL AND t_2b45b.`attribute_id` = 2 AND t_2b45b.`value` = TRUE AND (t_2b45b.`aid` = `properties`.`aid`))) AND 
(exists (SELECT 1 FROM `property_value_dates` as t_54497 WHERE t_54497.`deleted_at` IS NULL AND t_54497.`attribute_id` = 174 AND t_54497.`value` BETWEEN '2017-05-16' AND '2017-06-15' AND (t_54497.`aid` = `properties`.`aid`))) AND 
(exists (SELECT 1 FROM `property_value_numerics` as t_a525e WHERE t_a525e.`deleted_at` IS NULL AND t_a525e.`attribute_id` = 175 AND t_a525e.`value` BETWEEN 200000.0 AND 1000000.0 AND (t_a525e.`aid` = `properties`.`aid`))) AND 

# Just checking for existence of these values, not concerned with actual value.
(exists (SELECT 1 FROM `property_value_strings` as t_79c04 WHERE t_79c04.`deleted_at` IS NULL AND t_79c04.`attribute_id` = 39 AND (t_79c04.`aid` = `properties`.`aid`))) AND 
(exists (SELECT 1 FROM `property_value_strings` as t_3936d WHERE t_3936d.`deleted_at` IS NULL AND t_3936d.`attribute_id` = 47 AND (t_3936d.`aid` = `properties`.`aid`))) AND 
(exists (SELECT 1 FROM `property_value_strings` as t_07d8d WHERE t_07d8d.`deleted_at` IS NULL AND t_07d8d.`attribute_id` = 49 AND (t_07d8d.`aid` = `properties`.`aid`))) AND 
(exists (SELECT 1 FROM `property_value_numerics` as t_0506f WHERE t_0506f.`deleted_at` IS NULL AND t_0506f.`attribute_id` = 54 AND (t_0506f.`aid` = `properties`.`aid`))) AND 
(exists (SELECT 1 FROM `property_value_numerics` as t_2172b WHERE t_2172b.`deleted_at` IS NULL AND t_2172b.`attribute_id` = 55 AND (t_2172b.`aid` = `properties`.`aid`))) AND 
(exists (SELECT 1 FROM `property_value_dates` as t_b2d33 WHERE t_b2d33.`deleted_at` IS NULL AND t_b2d33.`attribute_id` = 174 AND (t_b2d33.`aid` = `properties`.`aid`))) AND 
(exists (SELECT 1 FROM `property_value_numerics` as t_cc807 WHERE t_cc807.`deleted_at` IS NULL AND t_cc807.`attribute_id` = 175 AND (t_cc807.`aid` = `properties`.`aid`))) AND 
(exists (SELECT 1 FROM `property_value_strings` as t_34123 WHERE t_34123.`deleted_at` IS NULL AND t_34123.`attribute_id` = 0 AND (t_34123.`aid` = `properties`.`aid`))) AND 
(exists (SELECT 1 FROM `property_value_strings` as t_74163 WHERE t_74163.`deleted_at` IS NULL AND t_74163.`attribute_id` = 99 AND (t_74163.`aid` = `properties`.`aid`))) AND 
(exists (SELECT 1 FROM `property_value_strings` as t_f3994 WHERE t_f3994.`deleted_at` IS NULL AND t_f3994.`attribute_id` = 107 AND (t_f3994.`aid` = `properties`.`aid`))) AND 
(exists (SELECT 1 FROM `property_value_strings` as t_ee85d WHERE t_ee85d.`deleted_at` IS NULL AND t_ee85d.`attribute_id` = 108 AND (t_ee85d.`aid` = `properties`.`aid`))) AND 
(exists (SELECT 1 FROM `property_value_strings` as t_2f858 WHERE t_2f858.`deleted_at` IS NULL AND t_2f858.`attribute_id` = 109 AND (t_2f858.`aid` = `properties`.`aid`)))

I can run analyze table property_value_strings, which sometimes makes the query finish, but not sure if that is a feasible solution for production usage. When the query works it is very fast (on a sample of 100,000 properties locally, this query returns 7,400 in 43ms).

One thing I have tried so far is to set optimizer_search_depth to 0, but that doesn't seem to have fixed anything.

My questions are:

  1. What is causing mysql to hang in the "statistics" state? I assume it's the number of exists statements being used, but why? Can this be avoided using these queries?
  2. Is this the most efficient way to search these value tables? I'm totally open to rearranging the query if there is a better way to search.

EDIT: I tried out the various solutions people provided, and marked the most helpful answer as the solution, however as @debugger pointed out, EAV has some limitations. After loading 600k properties in my local environment (which is nowhere near our production dataset size) the queries which seemed acceptable on 100k records took 10+ seconds to return results. I'm now ditching the EAV model and exploring other options.


Solution

  • I created a random set of test data, and found your query to be SLOW (more than two minutes).

    After rewriting it, it was fast, under 1 second.

    SELECT COUNT(*)
    FROM `properties` 
    WHERE `properties`.`deleted_at` IS NULL AND 
        (exists (SELECT 1 FROM `property_value_strings` as t_74653 
                 WHERE t_74653.`deleted_at` IS NULL AND t_74653.`attribute_id` = 48 AND t_74653.`value` = 'NC' AND (t_74653.`aid` = `properties`.`aid`))) AND 
        (exists (SELECT 1 FROM `property_value_strings` as t_a9e53 
                 WHERE t_a9e53.`deleted_at` IS NULL AND t_a9e53.`attribute_id` = 14 AND t_a9e53.`value` = 'Wake' AND (t_a9e53.`aid` = `properties`.`aid`))) AND 
        (exists (SELECT 1 FROM `property_value_strings` as t_a225b 
                 WHERE t_a225b.`deleted_at` IS NULL AND t_a225b.`attribute_id` = 163 AND t_a225b.`value` IN ('181', '366', '369', '378', '385', '386', '388') AND (t_a225b.`aid` = `properties`.`aid`))) AND 
        (exists (SELECT 1 FROM `property_value_booleans` as t_2b45b 
                 WHERE t_2b45b.`deleted_at` IS NULL AND t_2b45b.`attribute_id` = 2 AND t_2b45b.`value` = TRUE AND (t_2b45b.`aid` = `properties`.`aid`))) AND 
        (exists (SELECT 1 FROM `property_value_dates` as t_54497 
                 WHERE t_54497.`deleted_at` IS NULL AND t_54497.`attribute_id` = 174 AND t_54497.`value` BETWEEN '2017-05-16' AND '2017-06-15' AND (t_54497.`aid` = `properties`.`aid`))) AND 
        (exists (SELECT 1 FROM `property_value_numerics` as t_a525e 
                 WHERE t_a525e.`deleted_at` IS NULL AND t_a525e.`attribute_id` = 175 AND t_a525e.`value` BETWEEN 200000.0 AND 1000000.0 AND (t_a525e.`aid` = `properties`.`aid`))) AND 
        # Just checking for existence of these values, not concerned with actual value.
           (select count(*) FROM (
               select attribute_id, count(*) 
               from property_value_strings 
               where `attribute_id` in (39,47,49,0,99,107,108,109)
               group by attribute_id 
               having count(*)>0
            ) x)=8 AND  
           (select count(*) FROM ( 
               select attribute_id, count(*) 
               from property_value_numerics
               where `attribute_id` in (54,55)
               group by attribute_id 
               having count(*)>0
           ) x) =2 
    ;
    

    Please note that under "Just checking for existence of these values, not concerned with actual value", the values 174 and 175 are not checked, because they are already checked with a value.

    My test set is not very large:

    select 'properties',count(*) from properties p
    union all
    select 'boolans',count(*) from property_value_booleans pvb
    union all
    select 'dates',count(*) from property_value_dates pvd 
    union all
    select 'numerics',count(*) from property_value_numerics pvn 
    union all
    select 'strings',count(*) from property_value_strings pvs 
    
    properties count(*)
    properties 234
    boolans 1201
    dates 1802
    numerics 2403
    strings 10626