Search code examples
sqlvectorgoogle-bigquerycosine-similarity

BigQuery - find N nearest vectors


I have a bigquery table, which has a column of the repeated data type with a 512 dimensional vector (float).

I would like to run a query which finds the N most similar vectors.

In my case similarity can be simply defined as the inner product of the target vector and each vector in the database.

I have found and run the below query, which generates this across all the combinations in the table:

#standardSQL
CREATE TABLE ml.url_cosine_similarity AS
WITH pairwise AS (
  SELECT t1.url AS id_1, t2.url AS id_2
  FROM `project.dataset.table` t1
  INNER JOIN `project.dataset.table` t2
  ON t1.url < t2.url
)
SELECT id_1, id_2, ( 
  SELECT 
    SUM(value1 * value2)/ 
    SQRT(SUM(value1 * value1))/ 
    SQRT(SUM(value2 * value2))
  FROM UNNEST(a.page_vector) value1 WITH OFFSET pos1 
  JOIN UNNEST(b.page_vector) value2 WITH OFFSET pos2 
  ON pos1 = pos2
  ) cosine_similarity
FROM pairwise t
JOIN `project.dataset.table` a ON a.url = id_1
JOIN `project.dataset.table` b ON b.url = id_2

However since I do not have a good grasp of how arrays work in bigquery, I am unsure on how to change this query to take in a target vector, and return N neighbours.


Solution

  • See simplified example - it returns top 3 nearest pairs of vectors in the table

    #standardSQL
    WITH `project.dataset.table` AS (
      SELECT 1 id, [1,2,3,4,5] page_vector UNION ALL
      SELECT 2, [1,3,4,5,16] UNION ALL
      SELECT 3, [2,3,4,5,6] UNION ALL
      SELECT 4, [2,4,6,8,9] UNION ALL
      SELECT 5, [1,3,4,5,16] UNION ALL
      SELECT 6, [11,12,13,14,15] 
    )
    SELECT a.id id1, b.id id2, ( 
      SELECT 
        SUM(value1 * value2)/ 
        SQRT(SUM(value1 * value1))/ 
        SQRT(SUM(value2 * value2))
      FROM UNNEST(a.page_vector) value1 WITH OFFSET pos1 
      JOIN UNNEST(b.page_vector) value2 WITH OFFSET pos2 
      ON pos1 = pos2
      ) cosine_similarity
    FROM `project.dataset.table` a
    JOIN `project.dataset.table` b
    ON a.id < b.id
    ORDER BY cosine_similarity DESC
    LIMIT 3  
    

    with output

    Row id1 id2 cosine_similarity    
    1   2   5   1.0  
    2   1   4   0.9986422261219272   
    3   3   4   0.9962894120648842    
    

    If you want to output the nearest vectors (let's say two) for every vector in table - see below example

    #standardSQL
    WITH `project.dataset.table` AS (
      SELECT 1 id, [1,2,3,4,5] page_vector UNION ALL
      SELECT 2, [1,3,4,5,16] UNION ALL
      SELECT 3, [2,3,4,5,6] UNION ALL
      SELECT 4, [2,4,6,8,9] UNION ALL
      SELECT 5, [1,3,4,5,16] UNION ALL
      SELECT 6, [11,12,13,14,15] 
    )
    SELECT id, ANY_VALUE(page_vector) page_vector, 
      ARRAY_AGG(
        STRUCT(id2 AS id, page_vector2 AS page_vector, cosine_similarity AS cosine_similarity) 
        ORDER BY cosine_similarity DESC 
        LIMIT 2
      ) similar_vectors
    FROM (
      SELECT a.id, a.page_vector, 
        b.id id2, b.page_vector page_vector2, ( 
        SELECT 
          SUM(value1 * value2)/ 
          SQRT(SUM(value1 * value1))/ 
          SQRT(SUM(value2 * value2))
        FROM UNNEST(a.page_vector) value1 WITH OFFSET pos1 
        JOIN UNNEST(b.page_vector) value2 WITH OFFSET pos2 
        ON pos1 = pos2
        ) cosine_similarity
      FROM `project.dataset.table` a
      JOIN `project.dataset.table` b
      ON a.id != b.id
    )
    GROUP BY id
    ORDER BY id  
    

    this will produce below output

    enter image description here