
Identify blocks/groups of words in a document based on a key word and positional data?


Consider that we have the following input data table.

import pandas as pd
#Pandas settings to see all the data when printing
pd.set_option('display.max_columns', None)  # or 1000
pd.set_option('display.max_rows', None)  # or 1000
pd.set_option('display.width', 500)

#Load the data
data_array = [[576, 60, 279, 28, 2, 'LzR', 0, 0], [578, 17, 318, 23, 3, 'U', 0, 0], [371, 21, 279, 24, 2, 'K', 0, 0], [373, 134, 317, 25, 3, 'mq77MJc', 0, 0], [537, 32, 317, 25, 3, '53', 0, 0], [373, 201, 355, 25, 4, '7Q7NZzkAzN', 0, 0], [538, 118, 393, 24, 5, 'oNNbgA', 0, 0], [680, 39, 392, 26, 5, 'J9', 0, 0], [1509, 155, 260, 154, 2, 'd', 0, 0], [1731, 98, 268, 123, 2, 'z8', 0, 0], [1876, 385, 271, 120, 2, 'rUqNDY', 0, 0], [1640, 197, 590, 21, 7, 't5gNVHDXQVJ', 0, 0], [1989, 270, 589, 22, 7, 't3I81fBOE9caUfb', 0, 0], [352, 80, 645, 25, 8, 'i5f3', 0, 1], [454, 245, 645, 25, 8, 'KrqcRA7Se7X7', 1, 1], [719, 60, 645, 27, 8, 'bpN', 0, 1], [1640, 161, 642, 22, 8, 'skAzt6Np4', 0, 0], [1822, 51, 643, 21, 8, 'K59', 0, 0], [2082, 177, 642, 22, 8, 'cwyN7wsMhE', 0, 0], [353, 220, 683, 25, 9, 'O8coFUwMUbE', 0, 1], [597, 17, 683, 25, 9, 'L', 0, 1], [1640, 234, 695, 22, 9, 'oVWEKowWnbT2y', 0, 0], [2080, 179, 695, 22, 9, 'FvjigCiC7h', 0, 0], [351, 79, 721, 24, 10, 'OQN3', 0, 1], [476, 202, 720, 25, 10, 'S2gcfJIDze', 0, 1], [2062, 69, 775, 22, 11, 'n9lN', 0, 0], [2155, 8, 775, 21, 11, 'G', 0, 0], [2188, 35, 775, 21, 11, '9X', 0, 0], [2246, 8, 775, 21, 11, 'v', 0, 0], [353, 81, 1003, 21, 13, 'c7ox8', 0, 0], [461, 325, 1003, 22, 13, 'o9GmMYAW4RrpPBY64p', 0, 0], [351, 101, 1037, 22, 14, '9NF7ii', 0, 0], [477, 146, 1037, 21, 14, 'MwTlIkU9', 0, 0], [350, 70, 1071, 22, 15, 'J5XF', 0, 0], [443, 87, 1071, 22, 15, '3m4tM', 0, 0], [553, 32, 1071, 22, 15, 'Ck', 0, 0], [609, 10, 1071, 22, 15, '5', 0, 0], [643, 53, 1071, 22, 15, 'X7Y', 0, 0], [1568, 135, 1092, 20, 16, 'P4', 0, 0], [352, 142, 1105, 22, 16, 'Pjs1GYSG', 0, 0], [516, 45, 1105, 22, 16, 'o9V', 0, 0], [588, 106, 1105, 22, 16, 'WRI8oY', 0, 0], [1563, 132, 1117, 20, 16, '3cZY', 0, 0], [350, 69, 1140, 21, 17, 'GW3y', 0, 0], [441, 35, 1139, 22, 17, 'EO', 0, 0], [497, 51, 1139, 28, 17, 'toN', 0, 0], [570, 49, 1140, 21, 17, 'k11', 0, 0], [643, 51, 1139, 22, 17, 'pod', 0, 0], [715, 89, 1140, 21, 17, '6SQfv', 0, 0], [825, 83, 1139, 22, 17, 'CzC2M', 0, 0], [934, 102, 1140, 21, 17, 'aowjQC', 0, 0], [1062, 51, 1140, 21, 17, 'BtC', 0, 0], [1558, 136, 1142, 20, 17, 'XhJ', 0, 0], [1722, 336, 1115, 25, 16, 'OgtXP2nxOwP7Gb3I', 0, 0], [352, 125, 1174, 21, 18, 'zYmvutc', 0, 0], [498, 45, 1174, 21, 18, 'JvN', 0, 0], [570, 124, 1174, 21, 18, 'TyZdJG4', 0, 0], [352, 64, 1207, 22, 19, 'Lvam', 0, 0], [443, 45, 1208, 21, 19, 'Onk', 0, 0], [516, 123, 1208, 21, 19, 'bPgi7tF', 0, 0], [1946, 12, 1231, 11, 20, 'I', 0, 0], [351, 106, 1241, 23, 20, 'xbAa7n', 0, 0], [479, 306, 1242, 22, 20, 'NEn7uifO17vkyzVVp', 0, 0], [1300, 142, 1242, 27, 20, 'dZukserV', 0, 0], [352, 178, 1275, 34, 21, 'qrxWKyJjjn', 0, 0], [557, 60, 1275, 28, 21, '2Ri5', 0, 0], [1354, 88, 1276, 27, 21, 'ZCp3F', 0, 0], [1558, 197, 1231, 63, 20, 'YgoGs', 0, 0], [1787, 96, 1247, 63, 20, 'Um', 0, 0], [1913, 268, 1231, 63, 20, 'YL7fkaV', 0, 0], [351, 70, 1309, 23, 22, 'kcGD', 0, 0], [443, 142, 1309, 23, 22, 'lGAx6Ljx', 0, 0], [605, 35, 1310, 21, 22, 'Hm', 0, 0], [661, 142, 1310, 27, 22, 'S8gZ5tPE', 0, 0], [1302, 135, 1310, 27, 22, 'gjgVPImz', 0, 0], [1743, 12, 1329, 11, 23, 'Z', 0, 0], [2055, 16, 1324, 17, 23, 'i', 0, 0], [353, 11, 1344, 21, 24, 'L', 0, 0], [386, 53, 1344, 21, 24, 'Q5J', 0, 0], [1300, 142, 1344, 27, 24, '9L9ScEj2', 0, 0], [1558, 400, 1345, 63, 24, 'S8YyUDnXd', 0, 0], [1993, 91, 1345, 62, 24, '4P', 0, 0], [1555, 102, 1605, 35, 25, 'kbGP', 0, 2], [1674, 371, 1605, 44, 25, 'DO1tvoEyiX9AVz6Q', 0, 2], [2062, 147, 1605, 44, 25, 'DtQAa3', 2, 2], [1554, 53, 1669, 35, 26, 'pg', 0, 2], [1624, 104, 1660, 34, 26, 'ZPsJ', 
0, 2], [1746, 221, 1659, 38, 26, '7CBPYAUA', 0, 2], [1987, 50, 1657, 46, 26, 'AL', 0, 2], [1555, 407, 1714, 44, 27, 'LA3ShdHUE3DAoOkfiB', 0, 2], [188, 1826, 2340, 3, 29, '4', 0, 0], [2024, 217, 2309, 34, 28, 'DLpZXhKepjdcyW', 0, 0], [2239, 119, 2310, 33, 28, '28otEfj9', 0, 0], [230, 77, 2349, 23, 29, 'Th1YC4R', 0, 0], [476, 89, 2349, 18, 29, 'uFRt5qEx', 0, 0], [1140, 463, 2388, 35, 30, 'Mxcsoj1MOubuEB33', 0, 0], [1708, 40, 2372, 17, 30, 'OfA', 0, 9], [1758, 81, 2372, 22, 30, 'ZQoO7mwr', 0, 9], [1848, 3, 2372, 17, 30, 'M', 0, 9], [1860, 134, 2372, 22, 30, 'IvtUnQ4Zxc29A', 0, 9], [2002, 20, 2376, 13, 30, '3V', 0, 9], [2029, 32, 2372, 17, 30, '6t8', 0, 9], [2070, 133, 2372, 17, 30, 'PdCWscuWGHR', 0, 9], [1709, 171, 2398, 22, 30, 'RsW4Oj1Lhf1ljQV4G', 0, 9], [1890, 148, 2398, 22, 30, 'VSUJUa3tuYIhiXxP', 9, 9], [2048, 34, 2398, 17, 30, 'aAm', 0, 9], [2089, 21, 2403, 12, 30, 'uY', 0, 9], [2118, 53, 2398, 17, 30, '6DDFv', 0, 9], [2179, 28, 2398, 17, 30, 'DKJ', 0, 9], [2214, 66, 2398, 17, 30, 'NBmY9BD', 0, 9], [2289, 57, 2398, 18, 30, 'sYsrT', 0, 9], [1708, 25, 2425, 17, 31, 'jGk', 0, 9], [1736, 34, 2429, 13, 31, 'oX', 0, 9], [1778, 93, 2425, 17, 31, 'OvpfEyhHso', 0, 9], [120, 131, 2510, 23, 32, 'rZCsYsA6im2b', 0, 0], [260, 25, 2515, 18, 32, 'G6', 0, 0], [295, 107, 2510, 18, 32, 'd6eYwhzZuS', 0, 0], [132, 88, 2582, 22, 34, 'Xc84', 3, 3], [231, 223, 2582, 22, 34, 'MnMcBUHVmhl2', 0, 3], [463, 47, 2582, 22, 34, 'Vto', 0, 3], [132, 194, 2616, 22, 35, 'B4f1f4KpCHC', 0, 3], [338, 14, 2616, 22, 35, 'W', 0, 3], [131, 64, 2650, 22, 36, 'UW6t', 0, 3], [216, 181, 2650, 22, 36, 'hLULWi7xdj', 0, 3], [1044, 175, 2510, 18, 32, 'F9f7jvsfmjnXbK', 0, 0], [1226, 25, 2515, 18, 32, 'Vk', 0, 0], [1261, 177, 2510, 23, 32, 'TBlYLSoItzHKpG', 0, 0], [1054, 132, 2544, 22, 33, 'u4vvPgHd', 0, 0], [1053, 36, 2590, 21, 34, 'lN', 0, 4], [1101, 107, 2589, 23, 34, 'ieee4D', 0, 4], [1218, 47, 2589, 23, 34, 'kD6', 0, 4], [1054, 122, 2623, 23, 35, 'Ngf2xWa', 0, 4], [1189, 132, 2624, 22, 35, 'N27RyHsP', 0, 4], [1054, 204, 2657, 23, 36, 'e97JFxWTXfS', 0, 4], [1262, 43, 2658, 22, 36, 'p', 4, 4], [1054, 65, 2692, 22, 37, 'mle1', 0, 4], [1139, 186, 2691, 23, 37, 'o6tA5wFrK', 0, 4], [1337, 39, 2691, 23, 37, 'W3', 0, 4], [1709, 175, 2510, 18, 32, 'DQm27gIhcjmkdB', 0, 0], [1892, 25, 2515, 18, 32, '4Z', 0, 0], [1927, 176, 2510, 23, 32, 'rAP1PxzMyqkxdY', 0, 0], [1720, 132, 2544, 22, 33, 'JpsQeikW', 0, 0], [1719, 35, 2590, 21, 34, 'hD', 0, 5], [1766, 107, 2589, 23, 34, '3vzIwR', 0, 5], [1884, 47, 2589, 23, 34, 'kHw', 0, 5], [1720, 122, 2623, 23, 35, 'MYOKedL', 0, 5], [1854, 132, 2624, 22, 35, 'K8JXFVII', 5, 5], [1720, 204, 2657, 23, 36, 'bBkPRmgyfVp', 0, 5], [1928, 43, 2658, 22, 36, 'j', 0, 5], [1719, 65, 2692, 22, 37, 'RfU4', 0, 5], [1805, 185, 2691, 23, 37, 'wtK1L23Q4', 0, 5], [2003, 38, 2692, 22, 37, 'yY', 0, 5], [130, 255, 2804, 23, 38, 'jgoGjNh2DoLnb2b4PGonGvU', 0, 0], [1044, 117, 2804, 18, 38, 'qGXS7f7gRHy', 0, 0], [1168, 38, 2804, 18, 38, 'UQI', 0, 0], [1215, 102, 2804, 18, 38, 'P764bscKkx', 0, 0], [1320, 38, 2804, 18, 38, 'OtH', 0, 0], [1368, 58, 2804, 18, 38, 'VhrUJ', 0, 0], [1709, 100, 2804, 23, 38, 'zjQgoufCGU', 0, 0], [131, 55, 2852, 21, 40, 'piH', 0, 0], [198, 41, 2858, 15, 40, 'wU6P', 0, 0], [281, 124, 2852, 21, 40, 'riQCT4RX', 0, 0], [454, 138, 2852, 27, 40, 'jSAJPlWhyRE', 0, 0], [612, 77, 2852, 21, 40, 'nVS97', 0, 0], [131, 227, 2886, 21, 41, 'zExU7Poi4QW', 0, 0], [375, 235, 2886, 21, 41, 'pLTfHVP1qzb7Mh2', 0, 0], [138, 100, 2957, 15, 42, 'fv8', 0, 0], [1404, 4, 2978, 4, 42, 'B', 0, 0], [130, 103, 2975, 34, 42, 'qpg', 0, 0], 
[253, 252, 2974, 19, 42, 'T9SOmYWl4CUrdt8o', 0, 0], [1078, 3, 2972, 40, 42, 'S5', 0, 0], [1103, 62, 2978, 28, 42, 'L6W', 0, 0], [1181, 56, 2978, 28, 42, 'ep1', 0, 0], [1253, 118, 2978, 28, 42, 'oKhrqlI', 0, 0], [1384, 45, 2985, 21, 42, 'OyP', 0, 0], [1444, 132, 2978, 28, 42, 'mvg8Bw5', 0, 0], [1593, 55, 2972, 76, 42, 'eG', 0, 0], [218, 5, 3074, 18, 44, 'z', 0, 0], [231, 72, 3058, 18, 44, 'x1Pat7', 0, 0], [605, 5, 3074, 18, 44, 'P', 0, 0], [617, 39, 3058, 18, 44, 'dNT', 0, 0], [1053, 146, 3058, 23, 44, 'q7CLeOJhnI1oa', 0, 0], [1802, 5, 3074, 18, 44, '6', 0, 0], [1815, 72, 3058, 18, 44, 'acKa9h', 0, 0], [2119, 50, 3057, 35, 44, 'uGH', 0, 0], [461, 129, 3125, 29, 45, 'p6L5U', 0, 0], [623, 44, 3125, 29, 45, 'dC', 0, 0], [1046, 266, 3125, 29, 45, '9HBoqUyRbg', 0, 0], [1975, 129, 3125, 29, 45, 'qH1ph', 0, 0], [2136, 45, 3125, 29, 45, 'gG', 0, 0], [218, 5, 3183, 20, 46, 'j', 0, 0], [605, 5, 3183, 20, 46, 'o', 0, 0], [119, 24, 3213, 18, 47, 'QDN', 0, 8], [153, 94, 3213, 18, 47, 'EleVpvP4', 0, 8], [256, 105, 3213, 23, 47, 'dq9L2xQO7', 0, 8], [370, 7, 3223, 2, 47, 'n', 0, 8], [386, 69, 3212, 24, 47, 'L9EKl', 0, 8], [464, 83, 3213, 23, 47, 'AnF2rBIN', 0, 8], [555, 19, 3214, 17, 47, 'k6', 0, 8], [582, 62, 3213, 18, 47, 'y3M3kx', 8, 8], [654, 2, 3213, 18, 47, '1', 0, 8], [666, 139, 3212, 19, 47, 'SkmavPFrrrSv', 0, 8], [808, 52, 3213, 18, 47, 'bJ5S', 0, 8], [200, 100, 3316, 29, 50, 'NmNa', 0, 7], [336, 675, 3316, 29, 50, 'vB759g8XWkL7XXe5tCHZs7tAF', 7, 7], [1046, 42, 3203, 23, 47, 'v4T', 0, 0], [1095, 150, 3202, 19, 47, 'NH7vM6', 0, 0], [1251, 24, 3199, 22, 47, '47', 0, 0], [1802, 5, 3183, 20, 46, 'B', 0, 0], [2119, 5, 3183, 20, 46, 'b', 0, 0], [1714, 254, 3213, 23, 47, '2Za9eGyQyKp4S2rVYahzJNM', 0, 0], [1715, 55, 3261, 21, 48, 'djv', 0, 6], [1781, 41, 3267, 15, 48, '3WHD', 0, 6], [1864, 124, 3261, 21, 48, '8ucAV2oj', 0, 6], [2037, 139, 3261, 27, 48, 'baUoLawp6rY', 0, 6], [2196, 76, 3261, 21, 48, 'sRheu', 6, 6], [1715, 226, 3295, 21, 49, 'hAfhkKsI7Jx', 0, 6], [1959, 234, 3295, 21, 49, 'quecbSW4gEdjSGG', 0, 6], [1715, 176, 3329, 27, 50, 'ciaZR8NxiuEXr1', 0, 6], [1910, 140, 3329, 21, 50, 'vicUyHPNcN', 0, 6]]
data_pd = pd.DataFrame(data_array, columns=["left", "width", "top", "height", "lineNr", "randomWord", "keyWord", "keyWordGroup"])
print(data_pd)

The table contains a main column randomWord and a few other columns with positional coordinates of each word within the document.

To help visualize the data, I wrote the following code, which renders the table as an image for a better understanding of the problem.

from PIL import Image, ImageFont, ImageDraw # pip install Pillow
import random

#Create an empty image object, sized to fit all word bounding boxes
new_im = Image.new('RGB', ((data_pd["left"]+data_pd["width"]).max() + data_pd["left"].min(), (data_pd["top"]+data_pd["height"]).max()  + data_pd["top"].min() ), (255,255,255))
draw_new_im = ImageDraw.Draw(new_im)

#Create a dictionary with a random color for each unique keyWordGroup
uniqGroups = data_pd["keyWordGroup"].unique()
colors = {}
for g in uniqGroups:
    if g == 0:
        colors[str(g)] = "black"  # black for words that belong to no group
    else:
        colors[str(g)] = "#" + ''.join(random.choice('0123456789ABCDEF') for j in range(6))  # random color

#Write text to the image
for i, row in data_pd.iterrows():
    font = ImageFont.truetype("arial.ttf", int(row["height"]))  # substitute any TrueType font available on your system
    draw_new_im.text((int(row["left"]), int(row["top"])), str(row["randomWord"]), fill=colors[str(row["keyWordGroup"])], font=font)

#Save the image
new_im.save("TestImage.jpg")

As you can see, there is also a keyWord column. It contains the IDs of the key words for which we need to find the closest block/group of text they belong to.

The question of this post is the following: how can we identify the group/block of text closest to each key word in the keyWord column? As you can see in the generated image, for each keyWord ID we want to find all the words that are in its proximity and form a block of text.

The output I am looking for is shown in the keyWordGroup column, which is an example of which words should be assigned to which key words.

Is there any method we could use to find these blocks of text based on the key words and the rest of the positional data given?


Solution

  • The solution comprises two steps:

    1. Group words to the closest keyword (I wouldn't call this clustering, as the centers of the groups are already given here, as opposed to clustering, where you try to find clusters with no a priori known locations)
    2. Remove outliers that don't really seem to belong to a keyword, even though that keyword is the closest by distance.

    Grouping is straightforward: assign keyword numbers by distance using vector quantization. The only thing to bear in mind is that the keyword numbers in the original dataframe don't appear in an ordered sequence, whereas vq numbers its groups sequentially starting from 0. That's why we have to map the new keyword group numbers back to the given keyword numbers.
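
    As a minimal sketch of what scipy's vq computes (toy coordinates, made up for illustration):

    import numpy as np
    from scipy.cluster.vq import vq

    obs = np.array([[0.0, 0.0], [1.0, 1.0], [9.0, 9.0]])  # word centers
    centers = np.array([[0.5, 0.5], [10.0, 10.0]])        # keyword centers
    codes, dists = vq(obs, centers)
    print(codes)  # [0 0 1] -> index of the closest keyword for each word
    print(dists)  # Euclidean distance of each word to that closest keyword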

    Removing outliers can be done in different ways, and the question imposed no strict requirements on how the keyword groups should be formed. I chose a very simple approach: take the mean and standard deviation of the distances from the keyword to all keyword group members and treat words with distances greater than mean + x * stddev as outliers. A choice of x = 1.5 gives good results; a toy illustration and the complete code follow.
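
    The threshold rule on a toy array of distances (values made up for illustration):

    import numpy as np

    dists = np.array([10.0, 12.0, 11.0, 13.0, 60.0])  # distances of group members to their keyword
    limit = dists.mean() + 1.5 * dists.std()          # ~50.3 for these values
    print(dists > limit)                              # [False False False False  True] -> 60.0 is an outlier

    The complete code: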

    import pandas as pd
    
    #Load the data (data_array is the same list as defined in the question above)
    data_pd = pd.DataFrame(data_array, columns=["left", "width", "top", "height", "lineNr", "randomWord", "keyWord", "keyWordGroup"])
    
    
    ### group words to keywords
    from scipy.cluster.vq import vq
    # use the center point of each bounding box as the word's coordinates
    kw = data_pd[data_pd.keyWord != 0]
    wd = data_pd[data_pd.keyWord == 0]
    keywords = pd.concat([kw.left + kw.width/2, kw.top + kw.height/2], axis=1)
    words    = pd.concat([wd.left + wd.width/2, wd.top + wd.height/2], axis=1)
    # vq returns (codes, distances): the index of the closest keyword and the distance to it
    res = vq(words.to_numpy(), keywords.to_numpy())
    
    ### remove outliers
    import numpy as np
    factor = 1.5
    limits = []
    # calculate limit as limit = mean + factor * stddev for each keyWord
    for i in range(len(keywords)):
        d = res[1][res[0] == i]  # distances of all words assigned to keyword i
        limits.append(np.mean(d) + factor * np.std(d))
    
    # mark words with distance > limit as outliers
    for i in range(len(res[0])):
        if res[1][i] > limits[res[0][i]]:
            res[0][i] = -1
    
    ### assign results to dataframe
    words['keyWordGroupNew'] = res[0] + 1                      # shift by 1 so outliers (-1) become 0 = no group
    keywords['keyWordGroupNew'] = range(1, len(keywords) + 1)  # keywords belong to their own group
    data_pd = pd.concat([data_pd, pd.concat([words['keyWordGroupNew'], keywords['keyWordGroupNew']])], axis=1, join='outer')
    
    # renumber keyWordGroup according to keyWord numbering
    dic = dict(zip(range(1, len(keywords) + 1), data_pd[data_pd.keyWord!=0]['keyWord']))
    dic[0] = 0
    data_pd.keyWordGroupNew = data_pd.keyWordGroupNew.map(dic)
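    # For this data the keyword IDs appear in row order as 1, 2, 9, 3, 4, 5, 8, 7, 6,
    # so e.g. vq group 3 (the third keyword) is mapped back to keyWord ID 9:
    print(dic)  # {1: 1, 2: 2, 3: 9, 4: 3, 5: 4, 6: 5, 7: 8, 8: 7, 9: 6, 0: 0}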
    
    
    from PIL import Image, ImageFont, ImageDraw # pip install Pillow
    
    #Create an empty image object, sized to fit all word bounding boxes
    new_im = Image.new('RGB', ((data_pd["left"]+data_pd["width"]).max() + data_pd["left"].min(), (data_pd["top"]+data_pd["height"]).max()  + data_pd["top"].min() ), (255,255,255))
    draw_new_im = ImageDraw.Draw(new_im)
    
    #Create a dictionary with a distinct color for each unique keyWordGroupNew
    uniqGroups = data_pd["keyWordGroupNew"].unique()
    colors = {}
    i = 0
    for g in uniqGroups:
        if g == 0:
            colors[str(g)] = "black"  # black for words that belong to no group
        else:
            # spread hues evenly from 70 (yellow) to 360 (red)
            colors[str(g)] = "hsl(" + str(70 + i * 290 / (len(uniqGroups) - 2)) + ",100%,50%)"
            i += 1
    
    #Write text to the image and draw a frame around each keyword
    for i, row in data_pd.iterrows():
        font = ImageFont.truetype("arial.ttf", int(row["height"]))  # substitute any TrueType font available on your system
        draw_new_im.text((int(row["left"]), int(row["top"])), str(row["randomWord"]), fill=colors[str(row["keyWordGroupNew"])], font=font)
        if row["keyWord"] > 0:
            draw_new_im.rectangle([row["left"], row["top"], row["left"]+row["width"], row["top"]+row["height"]], outline=colors[str(row["keyWordGroupNew"])])
    
    #Save the image
    new_im.save("out-std.jpg")
    

    As you can see in the code, I also made two small improvements to the image generation: the colors are now uniformly distributed over the hue range from yellow to red, and frames are drawn around the keywords.

    Another approach to outlier detection is the local outlier factor. This technique marks isolated words that are not surrounded by other group members as outliers; a short sketch of its behavior follows.
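
    A minimal sketch of LocalOutlierFactor's behavior (toy points, made up for illustration; fit_predict returns -1 for outliers):

    import numpy as np
    from sklearn.neighbors import LocalOutlierFactor

    X = np.array([[0, 0], [0, 1], [1, 0], [1, 1], [10, 10]])  # tight cluster plus one isolated point
    clf = LocalOutlierFactor(n_neighbors=3, contamination=0.2)
    print(clf.fit_predict(X))  # [ 1  1  1  1 -1] -> the isolated point is flagged

    Applied to our data, the grouping step stays the same; only the outlier removal changes: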

    ### group words to keywords (run on a freshly loaded data_pd, as at the top of the previous script)
    from scipy.cluster.vq import vq
    kw = data_pd[data_pd.keyWord != 0]
    wd = data_pd[data_pd.keyWord == 0]
    keywords = pd.concat([kw.left + kw.width/2, kw.top + kw.height/2], axis=1)
    words    = pd.concat([wd.left + wd.width/2, wd.top + wd.height/2], axis=1)
    res = vq(words.to_numpy(), keywords.to_numpy())

    # assign preliminary group numbers (vq's 0-based codes shifted to 1-based; 0 will mean "no group")
    words['keyWordGroupNew'] = res[0] + 1
    keywords['keyWordGroupNew'] = range(1, len(keywords) + 1)
    
    ### remove outliers using the local outlier factor
    from sklearn.neighbors import LocalOutlierFactor
    clf = LocalOutlierFactor(n_neighbors=10, contamination=0.1)
    for i in range(1, len(keywords) + 1):
        group = words[words.keyWordGroupNew == i]
        y_pred = clf.fit_predict(group.iloc[:, 0:2].to_numpy())      # -1 marks an outlier
        words.loc[group[y_pred == -1].index, 'keyWordGroupNew'] = 0  # 0 = no group
    
    ### save results to dataframe
    data_pd = pd.concat([data_pd, pd.concat([words['keyWordGroupNew'],keywords['keyWordGroupNew']])], axis=1, join='outer')
    
    # renumber keyWordGroup according to keyWord numbering
    dic = dict(zip(range(1, len(keywords) + 1), data_pd[data_pd.keyWord!=0]['keyWord']))
    dic[0] = 0
    data_pd.keyWordGroupNew = data_pd.keyWordGroupNew.map(dic)
    
    # image generation as in previous example ...
    

    This doesn't work so well for relatively small groups, and when the keyword is located away from the center of its group, the results are visually not as good as with the other method. A contamination of 0.1, which is a commonly used value, was used here. For details see the original paper and the sklearn docs.

    Conclusion: both approaches give satisfactory results, which can be tuned by adjusting the factor x or the contamination parameter, respectively.