Search code examples
databasepostgresqldistinctmikro-ormdistinct-on

SQL left Join with DISTINCT ON returns duplicate rows


I`m using node with mikro-orm v5 and postgres 14. My schema goes like this:

- Business
  - Addresses[]
  - Sectors[]

And I want to query the addresses by distance.

The problem is that the return is two addresses with the same ID, one for each sector. If I remove the sector.id from the DISTINCT ON it returns one address but only one sector.

How can I return one address and two sectors?

My code:

const addressRepo = em.getRepository(Address);
const distanceQuery = addressRepo.buildDistanceQuery({ latitude, longitude });

const businessName = businessNameRaw?.trim();

// Start query
const qb = em.createQueryBuilder(Address, 'address');

const distinctOnColumns = [
  businessName ? `LEVENSHTEIN(business.business_name, '${businessName}')` : undefined,
  'distance',
  ...(data.orderBy ?? []),
  'sector.id',
  'address.id',
].filter((e) => !!e);

// Must match DISTINCT ON columns order
let orderBy: QBQueryOrderMap<Address> = {
  businessName: QueryOrder.ASC,
  distance: QueryOrder.ASC,
  ...data.orderBy?.reduce((obj, each, index) => ({ ...obj, [each]: data.orderDirection?.[index] }), {}),
  [qb.raw<string>('sector.id')]: QueryOrder.ASC,
  id: QueryOrder.ASC,
};

await qb
  .select([
    qb.raw(`distinct on(${distinctOnColumns.join(', ')}) address.*`),
    qb.raw(`${distanceQuery} as distance`),
  ])
  .leftJoinAndSelect('address.business', 'business')
  .leftJoinAndSelect('business.account', 'account')
  .leftJoinAndSelect('business.sectors', 'sector')

const total =
  (
    await qb
      .clone()
      .select('count(distinct(address.id)) as total')
      .andWhere(`${distanceQuery} < ?`, [distance])
      .execute('get')
  ).total || 0;

await qb
  .having(`${distanceQuery} < ?`, [distance])
  .groupBy([
    'address.id',
    'business.id',
    'account.id',
    'sector.id',
  ])
  .orderBy(orderBy)
  .limit(perPage, offset);

const results = await qb.execute<Address[]>('all');

The query from the code above:

select
  distinct on(
    LEVENSHTEIN(
      business.business_name,
      'Moraes, Macedo and Barros'
    ),
    distance,
    sector.id
  ) address.*,
  ROUND(
    CAST(
      (
        6371 * 2 * ASIN(
          SQRT(
            POWER(
              SIN(
                (-22.896547 - "address"."latitude") * pi() / 180 / 2
              ),
              2
            ) + COS(-22.896547 * pi() / 180) * COS("address"."latitude" * pi() / 180) * POWER(
              SIN(
                (-43.18179 - "address"."longitude") * pi() / 180 / 2
              ),
              2
            )
          )
        )
      ) AS numeric
    ),
    20
  ) as distance,
  "business"."id" as "business__id",
  "business"."updated_at" as "business__updated_at",
  "business"."created_at" as "business__created_at",
  "business"."business_name" as "business__business_name",
  "business"."description" as "business__description",
  "business"."first_name" as "business__first_name",
  "business"."last_name" as "business__last_name",
  "business"."cpf" as "business__cpf",
  "business"."birth_day" as "business__birth_day",
  "business"."employees" as "business__employees",
  "business"."account_id" as "business__account_id",
  "account"."id" as "account__id",
  "account"."updated_at" as "account__updated_at",
  "account"."created_at" as "account__created_at",
  "account"."email" as "account__email",
  "account"."cellphone_number" as "account__cellphone_number",
  "account"."is2faenabled" as "account__is2faenabled",
  "account"."password_change_expires" as "account__password_change_expires",
  "account"."picture_uri" as "account__picture_uri",
  "account"."last_sign_in_at" as "account__last_sign_in_at",
  "account"."last_access_at" as "account__last_access_at",
  "account"."client_id" as "account__client_id",
  "account"."business_id" as "account__business_id",
  "sector"."id" as "sector__id",
  "sector"."updated_at" as "sector__updated_at",
  "sector"."created_at" as "sector__created_at",
  "sector"."name" as "sector__name",
  "sector"."icon" as "sector__icon",
from
  "address" as "address"
  left join "business" as "business" on "address"."business_id" = "business"."id"
  left join "account" as "account" on "business"."account_id" = "account"."id"
  left join "sector_businesses" as "s1" on "business"."id" = "s1"."business_id"
  left join "sector" as "sector" on "s1"."sector_id" = "sector"."id"
where
  (
    unaccent(business.business_name) ILIKE unaccent('%Moraes, Macedo and Barros%')
  )
group by
  "address"."id",
  "business"."id",
  "account"."id",
  "sector"."id",
  "business"."business_name"
having
  (
    ROUND(
      CAST(
        (
          6371 * 2 * ASIN(
            SQRT(
              POWER(
                SIN(
                  (-22.896547 - "address"."latitude") * pi() / 180 / 2
                ),
                2
              ) + COS(-22.896547 * pi() / 180) * COS("address"."latitude" * pi() / 180) * POWER(
                SIN(
                  (-43.18179 - "address"."longitude") * pi() / 180 / 2
                ),
                2
              )
            )
          )
        ) AS numeric
      ),
      20
    ) < 60000
  )
order by
  LEVENSHTEIN(
    business.business_name,
    'Moraes, Macedo and Barros'
  ) asc,
  "distance" asc,
  "sector"."id" asc
limit
  100

And the return. The address returns twice because the sectors has two rows.

{
  "results": [
    {
      "id": "96d13133-a224-44df-b96c-ea938c1bfd6a",
      "line1": "2349 Beatriz Alameda",
      "line2": null,
      "district": "Cambridgeshire",
      "city": "Detroit",
      "stateOrProvince": "ES",
      "country": "CX",
      "postalCode": "25151508",
      "latitude": 71.473,
      "longitude": 99.9465,
      "isMain": true,
      "isBilling": true,
      "distance": "14131.78452896010000000000",
      "business": {
        "id": "133c05a7-ce43-46e4-8cc6-941e889329bf",
        "sectors": [
          {
            "id": "1633946a-ca28-428c-a847-30f61667afbc",
            "name": "Depilação"
          },
          {
            "id": "51ef3ea1-5bdf-45dd-ad10-a3aa3a7a2f33",
            "name": "Cabeleireiro"
          }
        ]
      }
    },
    {
      "id": "96d13133-a224-44df-b96c-ea938c1bfd6a",
      "line1": "2349 Beatriz Alameda",
      "line2": null,
      "district": "Cambridgeshire",
      "city": "Detroit",
      "stateOrProvince": "ES",
      "country": "CX",
      "postalCode": "25151508",
      "latitude": 71.473,
      "longitude": 99.9465,
      "isMain": true,
      "isBilling": true,
      "distance": "14131.78452896010000000000",
      "business": {
        "id": "133c05a7-ce43-46e4-8cc6-941e889329bf",
        "sectors": [
          {
            "id": "1633946a-ca28-428c-a847-30f61667afbc",
            "name": "Depilação"
          },
          {
            "id": "51ef3ea1-5bdf-45dd-ad10-a3aa3a7a2f33",
            "name": "Cabeleireiro"
          }
        ]
      }
    }
  ]
}

Edit 1

Alright, changing to qb.getResult() worked, more or less...

The address results are now unique, but the sectors does not return properly. It returns only one or two when there are far more.

When I add sector.id to DISTINCT ON it returns properly, but the address results does not it returns less than the amount specified by the limit()... but the count returns the right amount.


Edit 2

I tried reproducing my errors and found some others: https://codesandbox.io/s/node-ts-mikro-orm-i4yckf?file=/src/index.ts


Solution

  • If you use qb.execute(), you get the raw database results, and they will contain duplicates if you join a to-many relation (so a collection of entities - 1:m or m:n property).

    This is also the reason why you see that complex query (with nested subqueries) being generated, as if you combine to-many join with a limit clause, it would not work as expected (you would not limit the root entity, as there would be duplicates).

    You should prefer using qb.getResult() which will map the results to entities, it will handle deduplication.