根据关键字和位置数据确定文档中的词块/词组?

2024-04-23 08:05:50 发布

您现在位置:Python中文网/ 问答频道 /正文

假设我们有下面的输入数据表。

import pandas as pd
# Pandas display settings: lift the column/row truncation limits and widen the
# output so that all the data is visible when the table is printed.
display_options = {
    'display.max_columns': None,  # or 1000
    'display.max_rows': None,  # or 1000
    'display.width': 500,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

#Load the data
data_array = [[576, 60, 279, 28, 2, 'LzR', 0, 0], [578, 17, 318, 23, 3, 'U', 0, 0], [371, 21, 279, 24, 2, 'K', 0, 0], [373, 134, 317, 25, 3, 'mq77MJc', 0, 0], [537, 32, 317, 25, 3, '53', 0, 0], [373, 201, 355, 25, 4, '7Q7NZzkAzN', 0, 0], [538, 118, 393, 24, 5, 'oNNbgA', 0, 0], [680, 39, 392, 26, 5, 'J9', 0, 0], [1509, 155, 260, 154, 2, 'd', 0, 0], [1731, 98, 268, 123, 2, 'z8', 0, 0], [1876, 385, 271, 120, 2, 'rUqNDY', 0, 0], [1640, 197, 590, 21, 7, 't5gNVHDXQVJ', 0, 0], [1989, 270, 589, 22, 7, 't3I81fBOE9caUfb', 0, 0], [352, 80, 645, 25, 8, 'i5f3', 0, 1], [454, 245, 645, 25, 8, 'KrqcRA7Se7X7', 1, 1], [719, 60, 645, 27, 8, 'bpN', 0, 1], [1640, 161, 642, 22, 8, 'skAzt6Np4', 0, 0], [1822, 51, 643, 21, 8, 'K59', 0, 0], [2082, 177, 642, 22, 8, 'cwyN7wsMhE', 0, 0], [353, 220, 683, 25, 9, 'O8coFUwMUbE', 0, 1], [597, 17, 683, 25, 9, 'L', 0, 1], [1640, 234, 695, 22, 9, 'oVWEKowWnbT2y', 0, 0], [2080, 179, 695, 22, 9, 'FvjigCiC7h', 0, 0], [351, 79, 721, 24, 10, 'OQN3', 0, 1], [476, 202, 720, 25, 10, 'S2gcfJIDze', 0, 1], [2062, 69, 775, 22, 11, 'n9lN', 0, 0], [2155, 8, 775, 21, 11, 'G', 0, 0], [2188, 35, 775, 21, 11, '9X', 0, 0], [2246, 8, 775, 21, 11, 'v', 0, 0], [353, 81, 1003, 21, 13, 'c7ox8', 0, 0], [461, 325, 1003, 22, 13, 'o9GmMYAW4RrpPBY64p', 0, 0], [351, 101, 1037, 22, 14, '9NF7ii', 0, 0], [477, 146, 1037, 21, 14, 'MwTlIkU9', 0, 0], [350, 70, 1071, 22, 15, 'J5XF', 0, 0], [443, 87, 1071, 22, 15, '3m4tM', 0, 0], [553, 32, 1071, 22, 15, 'Ck', 0, 0], [609, 10, 1071, 22, 15, '5', 0, 0], [643, 53, 1071, 22, 15, 'X7Y', 0, 0], [1568, 135, 1092, 20, 16, 'P4', 0, 0], [352, 142, 1105, 22, 16, 'Pjs1GYSG', 0, 0], [516, 45, 1105, 22, 16, 'o9V', 0, 0], [588, 106, 1105, 22, 16, 'WRI8oY', 0, 0], [1563, 132, 1117, 20, 16, '3cZY', 0, 0], [350, 69, 1140, 21, 17, 'GW3y', 0, 0], [441, 35, 1139, 22, 17, 'EO', 0, 0], [497, 51, 1139, 28, 17, 'toN', 0, 0], [570, 49, 1140, 21, 17, 'k11', 0, 0], [643, 51, 1139, 22, 17, 'pod', 0, 0], [715, 89, 1140, 21, 17, '6SQfv', 0, 0], [825, 83, 1139, 22, 17, 
'CzC2M', 0, 0], [934, 102, 1140, 21, 17, 'aowjQC', 0, 0], [1062, 51, 1140, 21, 17, 'BtC', 0, 0], [1558, 136, 1142, 20, 17, 'XhJ', 0, 0], [1722, 336, 1115, 25, 16, 'OgtXP2nxOwP7Gb3I', 0, 0], [352, 125, 1174, 21, 18, 'zYmvutc', 0, 0], [498, 45, 1174, 21, 18, 'JvN', 0, 0], [570, 124, 1174, 21, 18, 'TyZdJG4', 0, 0], [352, 64, 1207, 22, 19, 'Lvam', 0, 0], [443, 45, 1208, 21, 19, 'Onk', 0, 0], [516, 123, 1208, 21, 19, 'bPgi7tF', 0, 0], [1946, 12, 1231, 11, 20, 'I', 0, 0], [351, 106, 1241, 23, 20, 'xbAa7n', 0, 0], [479, 306, 1242, 22, 20, 'NEn7uifO17vkyzVVp', 0, 0], [1300, 142, 1242, 27, 20, 'dZukserV', 0, 0], [352, 178, 1275, 34, 21, 'qrxWKyJjjn', 0, 0], [557, 60, 1275, 28, 21, '2Ri5', 0, 0], [1354, 88, 1276, 27, 21, 'ZCp3F', 0, 0], [1558, 197, 1231, 63, 20, 'YgoGs', 0, 0], [1787, 96, 1247, 63, 20, 'Um', 0, 0], [1913, 268, 1231, 63, 20, 'YL7fkaV', 0, 0], [351, 70, 1309, 23, 22, 'kcGD', 0, 0], [443, 142, 1309, 23, 22, 'lGAx6Ljx', 0, 0], [605, 35, 1310, 21, 22, 'Hm', 0, 0], [661, 142, 1310, 27, 22, 'S8gZ5tPE', 0, 0], [1302, 135, 1310, 27, 22, 'gjgVPImz', 0, 0], [1743, 12, 1329, 11, 23, 'Z', 0, 0], [2055, 16, 1324, 17, 23, 'i', 0, 0], [353, 11, 1344, 21, 24, 'L', 0, 0], [386, 53, 1344, 21, 24, 'Q5J', 0, 0], [1300, 142, 1344, 27, 24, '9L9ScEj2', 0, 0], [1558, 400, 1345, 63, 24, 'S8YyUDnXd', 0, 0], [1993, 91, 1345, 62, 24, '4P', 0, 0], [1555, 102, 1605, 35, 25, 'kbGP', 0, 2], [1674, 371, 1605, 44, 25, 'DO1tvoEyiX9AVz6Q', 0, 2], [2062, 147, 1605, 44, 25, 'DtQAa3', 2, 2], [1554, 53, 1669, 35, 26, 'pg', 0, 2], [1624, 104, 1660, 34, 26, 'ZPsJ', 0, 2], [1746, 221, 1659, 38, 26, '7CBPYAUA', 0, 2], [1987, 50, 1657, 46, 26, 'AL', 0, 2], [1555, 407, 1714, 44, 27, 'LA3ShdHUE3DAoOkfiB', 0, 2], [188, 1826, 2340, 3, 29, '4', 0, 0], [2024, 217, 2309, 34, 28, 'DLpZXhKepjdcyW', 0, 0], [2239, 119, 2310, 33, 28, '28otEfj9', 0, 0], [230, 77, 2349, 23, 29, 'Th1YC4R', 0, 0], [476, 89, 2349, 18, 29, 'uFRt5qEx', 0, 0], [1140, 463, 2388, 35, 30, 'Mxcsoj1MOubuEB33', 0, 0], [1708, 40, 2372, 17, 30, 
'OfA', 0, 9], [1758, 81, 2372, 22, 30, 'ZQoO7mwr', 0, 9], [1848, 3, 2372, 17, 30, 'M', 0, 9], [1860, 134, 2372, 22, 30, 'IvtUnQ4Zxc29A', 0, 9], [2002, 20, 2376, 13, 30, '3V', 0, 9], [2029, 32, 2372, 17, 30, '6t8', 0, 9], [2070, 133, 2372, 17, 30, 'PdCWscuWGHR', 0, 9], [1709, 171, 2398, 22, 30, 'RsW4Oj1Lhf1ljQV4G', 0, 9], [1890, 148, 2398, 22, 30, 'VSUJUa3tuYIhiXxP', 9, 9], [2048, 34, 2398, 17, 30, 'aAm', 0, 9], [2089, 21, 2403, 12, 30, 'uY', 0, 9], [2118, 53, 2398, 17, 30, '6DDFv', 0, 9], [2179, 28, 2398, 17, 30, 'DKJ', 0, 9], [2214, 66, 2398, 17, 30, 'NBmY9BD', 0, 9], [2289, 57, 2398, 18, 30, 'sYsrT', 0, 9], [1708, 25, 2425, 17, 31, 'jGk', 0, 9], [1736, 34, 2429, 13, 31, 'oX', 0, 9], [1778, 93, 2425, 17, 31, 'OvpfEyhHso', 0, 9], [120, 131, 2510, 23, 32, 'rZCsYsA6im2b', 0, 0], [260, 25, 2515, 18, 32, 'G6', 0, 0], [295, 107, 2510, 18, 32, 'd6eYwhzZuS', 0, 0], [132, 88, 2582, 22, 34, 'Xc84', 3, 3], [231, 223, 2582, 22, 34, 'MnMcBUHVmhl2', 0, 3], [463, 47, 2582, 22, 34, 'Vto', 0, 3], [132, 194, 2616, 22, 35, 'B4f1f4KpCHC', 0, 3], [338, 14, 2616, 22, 35, 'W', 0, 3], [131, 64, 2650, 22, 36, 'UW6t', 0, 3], [216, 181, 2650, 22, 36, 'hLULWi7xdj', 0, 3], [1044, 175, 2510, 18, 32, 'F9f7jvsfmjnXbK', 0, 0], [1226, 25, 2515, 18, 32, 'Vk', 0, 0], [1261, 177, 2510, 23, 32, 'TBlYLSoItzHKpG', 0, 0], [1054, 132, 2544, 22, 33, 'u4vvPgHd', 0, 0], [1053, 36, 2590, 21, 34, 'lN', 0, 4], [1101, 107, 2589, 23, 34, 'ieee4D', 0, 4], [1218, 47, 2589, 23, 34, 'kD6', 0, 4], [1054, 122, 2623, 23, 35, 'Ngf2xWa', 0, 4], [1189, 132, 2624, 22, 35, 'N27RyHsP', 0, 4], [1054, 204, 2657, 23, 36, 'e97JFxWTXfS', 0, 4], [1262, 43, 2658, 22, 36, 'p', 4, 4], [1054, 65, 2692, 22, 37, 'mle1', 0, 4], [1139, 186, 2691, 23, 37, 'o6tA5wFrK', 0, 4], [1337, 39, 2691, 23, 37, 'W3', 0, 4], [1709, 175, 2510, 18, 32, 'DQm27gIhcjmkdB', 0, 0], [1892, 25, 2515, 18, 32, '4Z', 0, 0], [1927, 176, 2510, 23, 32, 'rAP1PxzMyqkxdY', 0, 0], [1720, 132, 2544, 22, 33, 'JpsQeikW', 0, 0], [1719, 35, 2590, 21, 34, 'hD', 0, 5], [1766, 
107, 2589, 23, 34, '3vzIwR', 0, 5], [1884, 47, 2589, 23, 34, 'kHw', 0, 5], [1720, 122, 2623, 23, 35, 'MYOKedL', 0, 5], [1854, 132, 2624, 22, 35, 'K8JXFVII', 5, 5], [1720, 204, 2657, 23, 36, 'bBkPRmgyfVp', 0, 5], [1928, 43, 2658, 22, 36, 'j', 0, 5], [1719, 65, 2692, 22, 37, 'RfU4', 0, 5], [1805, 185, 2691, 23, 37, 'wtK1L23Q4', 0, 5], [2003, 38, 2692, 22, 37, 'yY', 0, 5], [130, 255, 2804, 23, 38, 'jgoGjNh2DoLnb2b4PGonGvU', 0, 0], [1044, 117, 2804, 18, 38, 'qGXS7f7gRHy', 0, 0], [1168, 38, 2804, 18, 38, 'UQI', 0, 0], [1215, 102, 2804, 18, 38, 'P764bscKkx', 0, 0], [1320, 38, 2804, 18, 38, 'OtH', 0, 0], [1368, 58, 2804, 18, 38, 'VhrUJ', 0, 0], [1709, 100, 2804, 23, 38, 'zjQgoufCGU', 0, 0], [131, 55, 2852, 21, 40, 'piH', 0, 0], [198, 41, 2858, 15, 40, 'wU6P', 0, 0], [281, 124, 2852, 21, 40, 'riQCT4RX', 0, 0], [454, 138, 2852, 27, 40, 'jSAJPlWhyRE', 0, 0], [612, 77, 2852, 21, 40, 'nVS97', 0, 0], [131, 227, 2886, 21, 41, 'zExU7Poi4QW', 0, 0], [375, 235, 2886, 21, 41, 'pLTfHVP1qzb7Mh2', 0, 0], [138, 100, 2957, 15, 42, 'fv8', 0, 0], [1404, 4, 2978, 4, 42, 'B', 0, 0], [130, 103, 2975, 34, 42, 'qpg', 0, 0], [253, 252, 2974, 19, 42, 'T9SOmYWl4CUrdt8o', 0, 0], [1078, 3, 2972, 40, 42, 'S5', 0, 0], [1103, 62, 2978, 28, 42, 'L6W', 0, 0], [1181, 56, 2978, 28, 42, 'ep1', 0, 0], [1253, 118, 2978, 28, 42, 'oKhrqlI', 0, 0], [1384, 45, 2985, 21, 42, 'OyP', 0, 0], [1444, 132, 2978, 28, 42, 'mvg8Bw5', 0, 0], [1593, 55, 2972, 76, 42, 'eG', 0, 0], [218, 5, 3074, 18, 44, 'z', 0, 0], [231, 72, 3058, 18, 44, 'x1Pat7', 0, 0], [605, 5, 3074, 18, 44, 'P', 0, 0], [617, 39, 3058, 18, 44, 'dNT', 0, 0], [1053, 146, 3058, 23, 44, 'q7CLeOJhnI1oa', 0, 0], [1802, 5, 3074, 18, 44, '6', 0, 0], [1815, 72, 3058, 18, 44, 'acKa9h', 0, 0], [2119, 50, 3057, 35, 44, 'uGH', 0, 0], [461, 129, 3125, 29, 45, 'p6L5U', 0, 0], [623, 44, 3125, 29, 45, 'dC', 0, 0], [1046, 266, 3125, 29, 45, '9HBoqUyRbg', 0, 0], [1975, 129, 3125, 29, 45, 'qH1ph', 0, 0], [2136, 45, 3125, 29, 45, 'gG', 0, 0], [218, 5, 3183, 20, 46, 'j', 0, 0], 
[605, 5, 3183, 20, 46, 'o', 0, 0], [119, 24, 3213, 18, 47, 'QDN', 0, 8], [153, 94, 3213, 18, 47, 'EleVpvP4', 0, 8], [256, 105, 3213, 23, 47, 'dq9L2xQO7', 0, 8], [370, 7, 3223, 2, 47, 'n', 0, 8], [386, 69, 3212, 24, 47, 'L9EKl', 0, 8], [464, 83, 3213, 23, 47, 'AnF2rBIN', 0, 8], [555, 19, 3214, 17, 47, 'k6', 0, 8], [582, 62, 3213, 18, 47, 'y3M3kx', 8, 8], [654, 2, 3213, 18, 47, '1', 0, 8], [666, 139, 3212, 19, 47, 'SkmavPFrrrSv', 0, 8], [808, 52, 3213, 18, 47, 'bJ5S', 0, 8], [200, 100, 3316, 29, 50, 'NmNa', 0, 7], [336, 675, 3316, 29, 50, 'vB759g8XWkL7XXe5tCHZs7tAF', 7, 7], [1046, 42, 3203, 23, 47, 'v4T', 0, 0], [1095, 150, 3202, 19, 47, 'NH7vM6', 0, 0], [1251, 24, 3199, 22, 47, '47', 0, 0], [1802, 5, 3183, 20, 46, 'B', 0, 0], [2119, 5, 3183, 20, 46, 'b', 0, 0], [1714, 254, 3213, 23, 47, '2Za9eGyQyKp4S2rVYahzJNM', 0, 0], [1715, 55, 3261, 21, 48, 'djv', 0, 6], [1781, 41, 3267, 15, 48, '3WHD', 0, 6], [1864, 124, 3261, 21, 48, '8ucAV2oj', 0, 6], [2037, 139, 3261, 27, 48, 'baUoLawp6rY', 0, 6], [2196, 76, 3261, 21, 48, 'sRheu', 6, 6], [1715, 226, 3295, 21, 49, 'hAfhkKsI7Jx', 0, 6], [1959, 234, 3295, 21, 49, 'quecbSW4gEdjSGG', 0, 6], [1715, 176, 3329, 27, 50, 'ciaZR8NxiuEXr1', 0, 6], [1910, 140, 3329, 21, 50, 'vicUyHPNcN', 0, 6]]
# Build the table: one row per word with its position (left/top), size
# (width/height), line number, text, keyword id and keyword group.
column_names = [
    "left",
    "width",
    "top",
    "height",
    "lineNr",
    "randomWord",
    "keyWord",
    "keyWordGroup",
]
data_pd = pd.DataFrame(data_array, columns=column_names)
print(data_pd)

该表包含一个主列randomWord和几个其他列,这些列记录了文档中每个单词的位置坐标。

为了帮助可视化数据,我写了下面这段代码,它可以根据表中的数据生成一幅图像,以便更好地可视化和理解问题:

from PIL import Image, ImageFont, ImageDraw # pip install Pillow
import random

# Render every word of data_pd at its recorded position, coloured by the
# expected group (column "keyWordGroup"), and save the result as an image.

# Create an empty white image. The smallest left/top offset is added once more
# to the far right/bottom edge, so the margin is the same on both sides.
# The numpy scalars are cast to plain ints before being handed to Pillow.
image_width = int((data_pd["left"] + data_pd["width"]).max() + data_pd["left"].min())
image_height = int((data_pd["top"] + data_pd["height"]).max() + data_pd["top"].min())
new_im = Image.new('RGB', (image_width, image_height), (255, 255, 255))
draw_new_im = ImageDraw.Draw(new_im)

# Create a dictionary with a random color for each unique keyWordGroup
uniqGroups = data_pd["keyWordGroup"].unique()
colors = {}
for g in uniqGroups:
    if g == 0:
        colors[str(g)] = "black"  # black marks words that belong to no group
    else:
        # random "#RRGGBB" color
        colors[str(g)] = "#" + ''.join(random.choice('0123456789ABCDEF') for _ in range(6))

# Write the text to the image. The font is loaded once per distinct size
# instead of once per word: many words share the same height, and loading the
# TrueType file again for every row is pure overhead.
fonts = {}
for _, row in data_pd.iterrows():
    font_size = int(row["height"])
    if font_size not in fonts:
        fonts[font_size] = ImageFont.truetype("arial.ttf", font_size)
    draw_new_im.text(
        (int(row["left"]), int(row["top"])),
        str(row["randomWord"]),
        fill=colors[str(row["keyWordGroup"])],
        font=fonts[font_size],
    )

# Save the image
new_im.save("TestImage.jpg")

如您所见,表中有keyWord列。此列包含一些关键字的ID,我们需要为这些关键字找出离它们最近、且与它们同属一个文本块/文本组的单词。

本文的问题如下:如何识别出与keyWord列中各关键字最接近的文本组/文本块?正如您在生成的图像中所看到的,对于每个keyWord ID,我们要找到所有邻近的单词,并由它们形成一个文本块。

我想要的输出在keyWordGroup列中,它是将单词分配给关键字的一个示例。

有没有什么方法可以根据关键字和其他位置数据来查找这些文本块?


Tags: the 文本 new for data top random 关键字
1条回答
网友
1楼 · 发布于 2024-04-23 08:05:50

解决方案包括两个步骤:

  1. 将单词分组到最接近的关键字(我不会称之为聚类,因为这里各组的中心已经给定,而聚类是在事先不知道簇位置的情况下去寻找簇)
  2. 删除异常值,即那些虽然距离该关键字最近、但看起来并不属于该关键字的单词。

使用vector quantization(矢量量化)按距离分配关键字编号,分组就很简单。这里唯一需要记住的是:原始数据帧中的关键字编号不是按顺序出现的,而vq返回的组编号是从0开始按顺序排列的。这就是为什么我们必须将新的关键字组编号映射回给定的关键字编号。

去除异常值可以用不同的方法来完成,而且问题中对于如何形成关键字组并没有严格的要求。我选择了一个非常简单的方法:计算关键字到其所有组成员的距离的平均值和标准差,并将距离大于 mean + x * StdDev 的单词视为异常值。选择 x = 1.5 可以得到良好的结果(good results)。

import pandas as pd

#Load the data
data_array = [[576, 60, 279, 28, 2, 'LzR', 0, 0], [578, 17, 318, 23, 3, 'U', 0, 0], [371, 21, 279, 24, 2, 'K', 0, 0], [373, 134, 317, 25, 3, 'mq77MJc', 0, 0], [537, 32, 317, 25, 3, '53', 0, 0], [373, 201, 355, 25, 4, '7Q7NZzkAzN', 0, 0], [538, 118, 393, 24, 5, 'oNNbgA', 0, 0], [680, 39, 392, 26, 5, 'J9', 0, 0], [1509, 155, 260, 154, 2, 'd', 0, 0], [1731, 98, 268, 123, 2, 'z8', 0, 0], [1876, 385, 271, 120, 2, 'rUqNDY', 0, 0], [1640, 197, 590, 21, 7, 't5gNVHDXQVJ', 0, 0], [1989, 270, 589, 22, 7, 't3I81fBOE9caUfb', 0, 0], [352, 80, 645, 25, 8, 'i5f3', 0, 1], [454, 245, 645, 25, 8, 'KrqcRA7Se7X7', 1, 1], [719, 60, 645, 27, 8, 'bpN', 0, 1], [1640, 161, 642, 22, 8, 'skAzt6Np4', 0, 0], [1822, 51, 643, 21, 8, 'K59', 0, 0], [2082, 177, 642, 22, 8, 'cwyN7wsMhE', 0, 0], [353, 220, 683, 25, 9, 'O8coFUwMUbE', 0, 1], [597, 17, 683, 25, 9, 'L', 0, 1], [1640, 234, 695, 22, 9, 'oVWEKowWnbT2y', 0, 0], [2080, 179, 695, 22, 9, 'FvjigCiC7h', 0, 0], [351, 79, 721, 24, 10, 'OQN3', 0, 1], [476, 202, 720, 25, 10, 'S2gcfJIDze', 0, 1], [2062, 69, 775, 22, 11, 'n9lN', 0, 0], [2155, 8, 775, 21, 11, 'G', 0, 0], [2188, 35, 775, 21, 11, '9X', 0, 0], [2246, 8, 775, 21, 11, 'v', 0, 0], [353, 81, 1003, 21, 13, 'c7ox8', 0, 0], [461, 325, 1003, 22, 13, 'o9GmMYAW4RrpPBY64p', 0, 0], [351, 101, 1037, 22, 14, '9NF7ii', 0, 0], [477, 146, 1037, 21, 14, 'MwTlIkU9', 0, 0], [350, 70, 1071, 22, 15, 'J5XF', 0, 0], [443, 87, 1071, 22, 15, '3m4tM', 0, 0], [553, 32, 1071, 22, 15, 'Ck', 0, 0], [609, 10, 1071, 22, 15, '5', 0, 0], [643, 53, 1071, 22, 15, 'X7Y', 0, 0], [1568, 135, 1092, 20, 16, 'P4', 0, 0], [352, 142, 1105, 22, 16, 'Pjs1GYSG', 0, 0], [516, 45, 1105, 22, 16, 'o9V', 0, 0], [588, 106, 1105, 22, 16, 'WRI8oY', 0, 0], [1563, 132, 1117, 20, 16, '3cZY', 0, 0], [350, 69, 1140, 21, 17, 'GW3y', 0, 0], [441, 35, 1139, 22, 17, 'EO', 0, 0], [497, 51, 1139, 28, 17, 'toN', 0, 0], [570, 49, 1140, 21, 17, 'k11', 0, 0], [643, 51, 1139, 22, 17, 'pod', 0, 0], [715, 89, 1140, 21, 17, '6SQfv', 0, 0], [825, 83, 1139, 22, 17, 
'CzC2M', 0, 0], [934, 102, 1140, 21, 17, 'aowjQC', 0, 0], [1062, 51, 1140, 21, 17, 'BtC', 0, 0], [1558, 136, 1142, 20, 17, 'XhJ', 0, 0], [1722, 336, 1115, 25, 16, 'OgtXP2nxOwP7Gb3I', 0, 0], [352, 125, 1174, 21, 18, 'zYmvutc', 0, 0], [498, 45, 1174, 21, 18, 'JvN', 0, 0], [570, 124, 1174, 21, 18, 'TyZdJG4', 0, 0], [352, 64, 1207, 22, 19, 'Lvam', 0, 0], [443, 45, 1208, 21, 19, 'Onk', 0, 0], [516, 123, 1208, 21, 19, 'bPgi7tF', 0, 0], [1946, 12, 1231, 11, 20, 'I', 0, 0], [351, 106, 1241, 23, 20, 'xbAa7n', 0, 0], [479, 306, 1242, 22, 20, 'NEn7uifO17vkyzVVp', 0, 0], [1300, 142, 1242, 27, 20, 'dZukserV', 0, 0], [352, 178, 1275, 34, 21, 'qrxWKyJjjn', 0, 0], [557, 60, 1275, 28, 21, '2Ri5', 0, 0], [1354, 88, 1276, 27, 21, 'ZCp3F', 0, 0], [1558, 197, 1231, 63, 20, 'YgoGs', 0, 0], [1787, 96, 1247, 63, 20, 'Um', 0, 0], [1913, 268, 1231, 63, 20, 'YL7fkaV', 0, 0], [351, 70, 1309, 23, 22, 'kcGD', 0, 0], [443, 142, 1309, 23, 22, 'lGAx6Ljx', 0, 0], [605, 35, 1310, 21, 22, 'Hm', 0, 0], [661, 142, 1310, 27, 22, 'S8gZ5tPE', 0, 0], [1302, 135, 1310, 27, 22, 'gjgVPImz', 0, 0], [1743, 12, 1329, 11, 23, 'Z', 0, 0], [2055, 16, 1324, 17, 23, 'i', 0, 0], [353, 11, 1344, 21, 24, 'L', 0, 0], [386, 53, 1344, 21, 24, 'Q5J', 0, 0], [1300, 142, 1344, 27, 24, '9L9ScEj2', 0, 0], [1558, 400, 1345, 63, 24, 'S8YyUDnXd', 0, 0], [1993, 91, 1345, 62, 24, '4P', 0, 0], [1555, 102, 1605, 35, 25, 'kbGP', 0, 2], [1674, 371, 1605, 44, 25, 'DO1tvoEyiX9AVz6Q', 0, 2], [2062, 147, 1605, 44, 25, 'DtQAa3', 2, 2], [1554, 53, 1669, 35, 26, 'pg', 0, 2], [1624, 104, 1660, 34, 26, 'ZPsJ', 0, 2], [1746, 221, 1659, 38, 26, '7CBPYAUA', 0, 2], [1987, 50, 1657, 46, 26, 'AL', 0, 2], [1555, 407, 1714, 44, 27, 'LA3ShdHUE3DAoOkfiB', 0, 2], [188, 1826, 2340, 3, 29, '4', 0, 0], [2024, 217, 2309, 34, 28, 'DLpZXhKepjdcyW', 0, 0], [2239, 119, 2310, 33, 28, '28otEfj9', 0, 0], [230, 77, 2349, 23, 29, 'Th1YC4R', 0, 0], [476, 89, 2349, 18, 29, 'uFRt5qEx', 0, 0], [1140, 463, 2388, 35, 30, 'Mxcsoj1MOubuEB33', 0, 0], [1708, 40, 2372, 17, 30, 
'OfA', 0, 9], [1758, 81, 2372, 22, 30, 'ZQoO7mwr', 0, 9], [1848, 3, 2372, 17, 30, 'M', 0, 9], [1860, 134, 2372, 22, 30, 'IvtUnQ4Zxc29A', 0, 9], [2002, 20, 2376, 13, 30, '3V', 0, 9], [2029, 32, 2372, 17, 30, '6t8', 0, 9], [2070, 133, 2372, 17, 30, 'PdCWscuWGHR', 0, 9], [1709, 171, 2398, 22, 30, 'RsW4Oj1Lhf1ljQV4G', 0, 9], [1890, 148, 2398, 22, 30, 'VSUJUa3tuYIhiXxP', 9, 9], [2048, 34, 2398, 17, 30, 'aAm', 0, 9], [2089, 21, 2403, 12, 30, 'uY', 0, 9], [2118, 53, 2398, 17, 30, '6DDFv', 0, 9], [2179, 28, 2398, 17, 30, 'DKJ', 0, 9], [2214, 66, 2398, 17, 30, 'NBmY9BD', 0, 9], [2289, 57, 2398, 18, 30, 'sYsrT', 0, 9], [1708, 25, 2425, 17, 31, 'jGk', 0, 9], [1736, 34, 2429, 13, 31, 'oX', 0, 9], [1778, 93, 2425, 17, 31, 'OvpfEyhHso', 0, 9], [120, 131, 2510, 23, 32, 'rZCsYsA6im2b', 0, 0], [260, 25, 2515, 18, 32, 'G6', 0, 0], [295, 107, 2510, 18, 32, 'd6eYwhzZuS', 0, 0], [132, 88, 2582, 22, 34, 'Xc84', 3, 3], [231, 223, 2582, 22, 34, 'MnMcBUHVmhl2', 0, 3], [463, 47, 2582, 22, 34, 'Vto', 0, 3], [132, 194, 2616, 22, 35, 'B4f1f4KpCHC', 0, 3], [338, 14, 2616, 22, 35, 'W', 0, 3], [131, 64, 2650, 22, 36, 'UW6t', 0, 3], [216, 181, 2650, 22, 36, 'hLULWi7xdj', 0, 3], [1044, 175, 2510, 18, 32, 'F9f7jvsfmjnXbK', 0, 0], [1226, 25, 2515, 18, 32, 'Vk', 0, 0], [1261, 177, 2510, 23, 32, 'TBlYLSoItzHKpG', 0, 0], [1054, 132, 2544, 22, 33, 'u4vvPgHd', 0, 0], [1053, 36, 2590, 21, 34, 'lN', 0, 4], [1101, 107, 2589, 23, 34, 'ieee4D', 0, 4], [1218, 47, 2589, 23, 34, 'kD6', 0, 4], [1054, 122, 2623, 23, 35, 'Ngf2xWa', 0, 4], [1189, 132, 2624, 22, 35, 'N27RyHsP', 0, 4], [1054, 204, 2657, 23, 36, 'e97JFxWTXfS', 0, 4], [1262, 43, 2658, 22, 36, 'p', 4, 4], [1054, 65, 2692, 22, 37, 'mle1', 0, 4], [1139, 186, 2691, 23, 37, 'o6tA5wFrK', 0, 4], [1337, 39, 2691, 23, 37, 'W3', 0, 4], [1709, 175, 2510, 18, 32, 'DQm27gIhcjmkdB', 0, 0], [1892, 25, 2515, 18, 32, '4Z', 0, 0], [1927, 176, 2510, 23, 32, 'rAP1PxzMyqkxdY', 0, 0], [1720, 132, 2544, 22, 33, 'JpsQeikW', 0, 0], [1719, 35, 2590, 21, 34, 'hD', 0, 5], [1766, 
107, 2589, 23, 34, '3vzIwR', 0, 5], [1884, 47, 2589, 23, 34, 'kHw', 0, 5], [1720, 122, 2623, 23, 35, 'MYOKedL', 0, 5], [1854, 132, 2624, 22, 35, 'K8JXFVII', 5, 5], [1720, 204, 2657, 23, 36, 'bBkPRmgyfVp', 0, 5], [1928, 43, 2658, 22, 36, 'j', 0, 5], [1719, 65, 2692, 22, 37, 'RfU4', 0, 5], [1805, 185, 2691, 23, 37, 'wtK1L23Q4', 0, 5], [2003, 38, 2692, 22, 37, 'yY', 0, 5], [130, 255, 2804, 23, 38, 'jgoGjNh2DoLnb2b4PGonGvU', 0, 0], [1044, 117, 2804, 18, 38, 'qGXS7f7gRHy', 0, 0], [1168, 38, 2804, 18, 38, 'UQI', 0, 0], [1215, 102, 2804, 18, 38, 'P764bscKkx', 0, 0], [1320, 38, 2804, 18, 38, 'OtH', 0, 0], [1368, 58, 2804, 18, 38, 'VhrUJ', 0, 0], [1709, 100, 2804, 23, 38, 'zjQgoufCGU', 0, 0], [131, 55, 2852, 21, 40, 'piH', 0, 0], [198, 41, 2858, 15, 40, 'wU6P', 0, 0], [281, 124, 2852, 21, 40, 'riQCT4RX', 0, 0], [454, 138, 2852, 27, 40, 'jSAJPlWhyRE', 0, 0], [612, 77, 2852, 21, 40, 'nVS97', 0, 0], [131, 227, 2886, 21, 41, 'zExU7Poi4QW', 0, 0], [375, 235, 2886, 21, 41, 'pLTfHVP1qzb7Mh2', 0, 0], [138, 100, 2957, 15, 42, 'fv8', 0, 0], [1404, 4, 2978, 4, 42, 'B', 0, 0], [130, 103, 2975, 34, 42, 'qpg', 0, 0], [253, 252, 2974, 19, 42, 'T9SOmYWl4CUrdt8o', 0, 0], [1078, 3, 2972, 40, 42, 'S5', 0, 0], [1103, 62, 2978, 28, 42, 'L6W', 0, 0], [1181, 56, 2978, 28, 42, 'ep1', 0, 0], [1253, 118, 2978, 28, 42, 'oKhrqlI', 0, 0], [1384, 45, 2985, 21, 42, 'OyP', 0, 0], [1444, 132, 2978, 28, 42, 'mvg8Bw5', 0, 0], [1593, 55, 2972, 76, 42, 'eG', 0, 0], [218, 5, 3074, 18, 44, 'z', 0, 0], [231, 72, 3058, 18, 44, 'x1Pat7', 0, 0], [605, 5, 3074, 18, 44, 'P', 0, 0], [617, 39, 3058, 18, 44, 'dNT', 0, 0], [1053, 146, 3058, 23, 44, 'q7CLeOJhnI1oa', 0, 0], [1802, 5, 3074, 18, 44, '6', 0, 0], [1815, 72, 3058, 18, 44, 'acKa9h', 0, 0], [2119, 50, 3057, 35, 44, 'uGH', 0, 0], [461, 129, 3125, 29, 45, 'p6L5U', 0, 0], [623, 44, 3125, 29, 45, 'dC', 0, 0], [1046, 266, 3125, 29, 45, '9HBoqUyRbg', 0, 0], [1975, 129, 3125, 29, 45, 'qH1ph', 0, 0], [2136, 45, 3125, 29, 45, 'gG', 0, 0], [218, 5, 3183, 20, 46, 'j', 0, 0], 
[605, 5, 3183, 20, 46, 'o', 0, 0], [119, 24, 3213, 18, 47, 'QDN', 0, 8], [153, 94, 3213, 18, 47, 'EleVpvP4', 0, 8], [256, 105, 3213, 23, 47, 'dq9L2xQO7', 0, 8], [370, 7, 3223, 2, 47, 'n', 0, 8], [386, 69, 3212, 24, 47, 'L9EKl', 0, 8], [464, 83, 3213, 23, 47, 'AnF2rBIN', 0, 8], [555, 19, 3214, 17, 47, 'k6', 0, 8], [582, 62, 3213, 18, 47, 'y3M3kx', 8, 8], [654, 2, 3213, 18, 47, '1', 0, 8], [666, 139, 3212, 19, 47, 'SkmavPFrrrSv', 0, 8], [808, 52, 3213, 18, 47, 'bJ5S', 0, 8], [200, 100, 3316, 29, 50, 'NmNa', 0, 7], [336, 675, 3316, 29, 50, 'vB759g8XWkL7XXe5tCHZs7tAF', 7, 7], [1046, 42, 3203, 23, 47, 'v4T', 0, 0], [1095, 150, 3202, 19, 47, 'NH7vM6', 0, 0], [1251, 24, 3199, 22, 47, '47', 0, 0], [1802, 5, 3183, 20, 46, 'B', 0, 0], [2119, 5, 3183, 20, 46, 'b', 0, 0], [1714, 254, 3213, 23, 47, '2Za9eGyQyKp4S2rVYahzJNM', 0, 0], [1715, 55, 3261, 21, 48, 'djv', 0, 6], [1781, 41, 3267, 15, 48, '3WHD', 0, 6], [1864, 124, 3261, 21, 48, '8ucAV2oj', 0, 6], [2037, 139, 3261, 27, 48, 'baUoLawp6rY', 0, 6], [2196, 76, 3261, 21, 48, 'sRheu', 6, 6], [1715, 226, 3295, 21, 49, 'hAfhkKsI7Jx', 0, 6], [1959, 234, 3295, 21, 49, 'quecbSW4gEdjSGG', 0, 6], [1715, 176, 3329, 27, 50, 'ciaZR8NxiuEXr1', 0, 6], [1910, 140, 3329, 21, 50, 'vicUyHPNcN', 0, 6]]
# Build the table: one row per word with its position (left/top), size
# (width/height), line number, text, keyword id and keyword group.
column_names = [
    "left",
    "width",
    "top",
    "height",
    "lineNr",
    "randomWord",
    "keyWord",
    "keyWordGroup",
]
data_pd = pd.DataFrame(data_array, columns=column_names)


### group words to keywords
# Assign every non-keyword word to its nearest keyword (vector quantization),
# then mark words that are unusually far from their keyword as outliers
# (group 0). The result is stored in data_pd["keyWordGroupNew"], expressed in
# the original keyWord ids.
import numpy as np
from scipy.cluster.vq import vq

# Represent every box by its centre point. Rows with keyWord != 0 form the
# fixed codebook; all remaining rows are the observations to assign.
is_keyword = data_pd["keyWord"] != 0
centres = pd.concat(
    [data_pd["left"] + data_pd["width"] / 2, data_pd["top"] + data_pd["height"] / 2],
    axis=1,
)
keywords = centres[is_keyword].copy()
words = centres[~is_keyword].copy()

# For each word: 0-based position of the nearest keyword and the distance to it.
codes, dists = vq(words.to_numpy(), keywords.to_numpy())

### remove outliers
factor = 1.5
# limit = mean + factor * stddev of the member distances, for each keyword.
# Keywords without any member keep a NaN limit; it is never looked up because
# no word refers to them.
limits = np.full(len(keywords), np.nan)
for code in np.unique(codes):
    member_dists = dists[codes == code]
    limits[code] = member_dists.mean() + factor * member_dists.std()

# words with distance > limit are outliers
is_outlier = dists > limits[codes]

### assign results to dataframe
# vq numbers the keywords 0..n-1 in row order, so indexing keyword_ids with
# the codes translates them straight back to the given keyWord ids.
keyword_ids = data_pd.loc[is_keyword, "keyWord"].to_numpy()
words["keyWordGroupNew"] = np.where(is_outlier, 0, keyword_ids[codes])
keywords["keyWordGroupNew"] = keyword_ids

# Plain column assignment (aligned on the index) instead of pd.concat(axis=1):
# re-running the snippet overwrites the column rather than adding a second
# column with the same name.
data_pd["keyWordGroupNew"] = pd.concat([words["keyWordGroupNew"], keywords["keyWordGroupNew"]])


from PIL import Image, ImageFont, ImageDraw # pip install Pillow
import random

# Render every word of data_pd at its recorded position, coloured by the
# computed group (column "keyWordGroupNew"), draw a frame around each keyword
# and save the result as an image.

# Create an empty white image. The smallest left/top offset is added once more
# to the far right/bottom edge, so the margin is the same on both sides.
# The numpy scalars are cast to plain ints before being handed to Pillow.
image_width = int((data_pd["left"] + data_pd["width"]).max() + data_pd["left"].min())
image_height = int((data_pd["top"] + data_pd["height"]).max() + data_pd["top"].min())
new_im = Image.new('RGB', (image_width, image_height), (255, 255, 255))
draw_new_im = ImageDraw.Draw(new_im)

# Create a dictionary with one color for each unique keyWordGroupNew:
# black for group 0 (no group), and hues spread evenly from 70 to 360 degrees
# over the remaining groups.
uniqGroups = data_pd["keyWordGroupNew"].unique()
realGroups = [g for g in uniqGroups if g != 0]
# Number of hue steps between the first and the last real group. Counting only
# the non-zero groups (and never going below 1) avoids a division by zero when
# there is a single group and keeps the hue within range when group 0 is absent.
hue_steps = max(len(realGroups) - 1, 1)
colors = {}
for g in uniqGroups:
    if g == 0:
        colors[str(g)] = "black"  # black marks words that belong to no group
for i, g in enumerate(realGroups):
    colors[str(g)] = "hsl(" + str(70 + i * 290 / hue_steps) + ",100%,50%)"

# Write the text to the image. The font is loaded once per distinct size
# instead of once per word.
fonts = {}
for _, row in data_pd.iterrows():
    font_size = int(row["height"])
    if font_size not in fonts:
        fonts[font_size] = ImageFont.truetype("arial.ttf", font_size)
    group_color = colors[str(row["keyWordGroupNew"])]
    draw_new_im.text(
        (int(row["left"]), int(row["top"])),
        str(row["randomWord"]),
        fill=group_color,
        font=fonts[font_size],
    )
    if row["keyWord"] > 0:
        # frame the keyword itself so the centre of each group is visible
        draw_new_im.rectangle(
            [row["left"], row["top"], row["left"] + row["width"], row["top"] + row["height"]],
            outline=group_color,
        )

# Save the image
new_im.save("out-std.jpg")

正如您在代码中看到的,我还对图像生成做了两个小改进:将颜色均匀分布在从黄色到红色的色调范围内,并在关键字周围绘制边框。

另一种异常检测方法称为local outlier factor(局部离群因子)。这种技术会把没有被其他组成员包围的孤立单词标记为离群值。

### group words to keywords
# Assign every non-keyword word to its nearest keyword (vector quantization),
# then let a local outlier factor model flag isolated words inside each group
# as outliers (group 0). The result is stored in data_pd["keyWordGroupNew"],
# expressed in the original keyWord ids.
from scipy.cluster.vq import vq
from sklearn.neighbors import LocalOutlierFactor

# Represent every box by its centre point. Rows with keyWord != 0 form the
# fixed codebook; all remaining rows are the observations to assign.
is_keyword = data_pd["keyWord"] != 0
centres = pd.concat(
    [data_pd["left"] + data_pd["width"] / 2, data_pd["top"] + data_pd["height"] / 2],
    axis=1,
)
keywords = centres[is_keyword].copy()
words = centres[~is_keyword].copy()
res = vq(words.to_numpy(), keywords.to_numpy())

# assign results: groups are numbered from 1 so that 0 can mark outliers
words['keyWordGroupNew'] = res[0] + 1
keywords['keyWordGroupNew'] = range(1, len(keywords) + 1)

### remove outliers
clf = LocalOutlierFactor(n_neighbors=10, contamination=0.1)
for group in range(1, len(keywords) + 1):
    in_group = (words['keyWordGroupNew'] == group).to_numpy()
    # The model needs at least one neighbour besides the point itself;
    # fit_predict raises on a group with no words or a single word, so such
    # groups are left as they are.
    if in_group.sum() < 2:
        continue
    y_pred = clf.fit_predict(words.loc[in_group].iloc[:, 0:2].to_numpy())
    words.loc[words.index[in_group][y_pred == -1], 'keyWordGroupNew'] = 0   # mark as outlier

### save results to dataframe
group_numbers = pd.concat([words['keyWordGroupNew'], keywords['keyWordGroupNew']])

# renumber keyWordGroup according to keyWord numbering
dic = dict(zip(range(1, len(keywords) + 1), data_pd.loc[is_keyword, 'keyWord']))
dic[0] = 0
# Plain column assignment (aligned on the index) instead of pd.concat(axis=1):
# if data_pd already carries a keyWordGroupNew column from an earlier run, it
# is overwritten rather than duplicated.
data_pd['keyWordGroupNew'] = group_numbers.map(dic)

# image generation as in previous example ...

这种方法对于相对较小的组效果不太好,而且当关键字不在组的中心位置时,结果在视觉上也不如前一种方法。contamination(污染率)=0.1 是一个常用值,其结果见原文链接(here)。有关详细信息,请参见原始论文(original paper)和 sklearn 文档(sklearn docs)。

结论:两种方法均能得到令人满意的结果,并且可以分别通过调整因子x和contamination(污染率)来调节。

相关问题 更多 >