anz2 committed on
Commit 48fef9e · 1 Parent(s): 85b9718

Install the evaluation package from git separately. Remove packages that were moved to a separate repository.

evaluation/iou.py DELETED
@@ -1,370 +0,0 @@
1
- # https://www.pyimagesearch.com/2016/11/07/intersection-over-union-iou-for-object-detection/
2
-
3
- import numpy as np
4
- import pandas as pd
5
- from scipy.sparse import csr_matrix
6
- from scipy.sparse.csgraph import connected_components
7
-
8
-
9
- def bb_intersection_over_union(boxA, boxB):
10
- EPS = 1e-5
11
- # determine the (x, y)-coordinates of the intersection rectangle
12
- xA = max(boxA[0], boxB[0])
13
- yA = max(boxA[1], boxB[1])
14
- xB = min(boxA[2], boxB[2])
15
- yB = min(boxA[3], boxB[3])
16
- # compute the area of intersection rectangle
17
- interArea = max(0, xB - xA + EPS) * max(0, yB - yA + EPS)
18
- # compute the area of both the prediction and ground-truth
19
- # rectangles
20
- boxAArea = (boxA[2] - boxA[0] + EPS) * (boxA[3] - boxA[1] + EPS)
21
- boxBArea = (boxB[2] - boxB[0] + EPS) * (boxB[3] - boxB[1] + EPS)
22
- # compute the intersection over union by taking the intersection
23
- # area and dividing it by the sum of prediction + ground-truth
24
- # areas - the intersection area
25
- iou = interArea / float(boxAArea + boxBArea - interArea)
26
- # return the intersection over union value
27
- return iou
28
-
29
-
30
- def bb_intersection_over_union_vectorized(bboxes1, bboxes2):
31
- low = np.s_[..., :2]
32
- high = np.s_[..., 2:]
33
-
34
- EPS = 1e-5
35
-
36
- A, B = bboxes1.copy(), bboxes2.copy()
37
- A = np.tile(A, (1, len(bboxes2))).reshape(len(bboxes1) * len(bboxes2), -1)
38
- B = np.tile(B, (len(bboxes1), 1))
39
-
40
- A[high] += EPS
41
- B[high] += EPS
42
-
43
- intrs = (
44
- np.maximum(
45
- 0.0,
46
- np.minimum(
47
- A[high],
48
- B[high]
49
- )
50
- -
51
- np.maximum(
52
- A[low],
53
- B[low]
54
- )
55
- )
56
- ).prod(-1)
57
-
58
- ious = intrs / ((A[high] - A[low]).prod(-1) + (B[high] - B[low]).prod(-1) - intrs)
59
-
60
- return ious.reshape(len(bboxes1), len(bboxes2))
61
-
62
-
63
- def bb_is_on_same_line_vectorized(bboxes1, bboxes2):
64
- low = np.s_[..., 1]
65
- high = np.s_[..., 3]
66
-
67
- A, B = bboxes1.copy(), bboxes2.copy()
68
- A = np.tile(A, (1, len(bboxes2))).reshape(len(bboxes1) * len(bboxes2), -1)
69
- B = np.tile(B, (len(bboxes1), 1))
70
-
71
- is_on_same_line = np.bitwise_and(
72
- np.bitwise_and(A[low] <= (B[low] + B[high]) / 2, (B[low] + B[high]) / 2 <= A[high]),
73
- np.bitwise_and(B[low] <= (A[low] + A[high]) / 2, (A[low] + A[high]) / 2 <= B[high]),
74
- )
75
-
76
- return is_on_same_line.reshape(len(bboxes1), len(bboxes2))
77
-
78
-
79
- def iou(ocr1, ocr2):
80
- return bb_intersection_over_union(
81
- (ocr1['x1'], ocr1['y1'], ocr1['x2'], ocr1['y2']),
82
- (ocr2['x1'], ocr2['y1'], ocr2['x2'], ocr2['y2'])
83
- )
84
-
85
-
86
- def _generate_empty_row(example_row, index):
87
- """This will generate empty row with empty values but it also generates tiny but valid bounding box
88
- to avoid exceptions while cropping the image"""
89
-
90
- example_row_dict = example_row.to_dict()
91
- example_row_dict['page'] = example_row_dict.get('page', 0)
92
- example_row_dict['block'] = 0
93
- example_row_dict['paragraph'] = 0
94
- example_row_dict['word'] = 0
95
- example_row_dict['x1'] = 0
96
- example_row_dict['y1'] = 0
97
- example_row_dict['x2'] = 1
98
- example_row_dict['y2'] = 1
99
- example_row_dict['conf'] = 0.0
100
- example_row_dict['text'] = ""
101
-
102
- empty_row = pd.DataFrame([example_row_dict], columns=example_row.index, index=[index])
103
-
104
- return empty_row
105
-
106
-
107
- def word_or_symbol_pair_matching(df1, df2, pref1, pref2):
108
- """Applies IOU based matching of words or symbol elements using rectangular bounding boxes (x1,y1,x2,y2).
109
- It makes sure that the matching between the first and second set is unique, which means that it's not allowed to have
111
- one item in two different pairs. If a pair isn't found, then an empty element is used as the pair. This way it's guaranteed
111
- that word or symbol level matching is correctly evaluated. Pairs are generated in decreasing order of IOU values.
112
- """
113
- # match word pairs by page
114
- text_pairs_dfs_per_page = []
115
- unique_page_ids = sorted(list(set(df1['page'].unique().tolist() + df2['page'].unique().tolist())))
116
-
117
- for page_id in unique_page_ids:
118
- # extract words for given page only
119
- df1_page = df1[df1.page == page_id]
120
- df2_page = df2[df2.page == page_id]
121
-
122
- if not df1_page.empty and not df2_page.empty:
123
-
124
- # calculate similarities
125
- similarity_metrics = calculate_ious_fast(ocr1_df=df1_page, ocr2_df=df2_page)
126
- similarities = []
127
- for idx1, index1 in enumerate(df1_page.index):
128
- for idx2, index2 in enumerate(df2_page.index):
129
- similarities.append((index1, index2, similarity_metrics[idx1, idx2]))
130
-
131
- # process pair similarities in decreasing order of similarity values
132
- sorted_similarities = sorted(similarities, key=lambda x: -x[2])
133
- paired_items_1 = set()
134
- paired_items_2 = set()
135
- pairs = []
136
- for idx1, idx2, similarity in sorted_similarities:
137
- if idx1 not in paired_items_1 and idx2 not in paired_items_2:
138
- if similarity > 0.0:
139
- paired_items_1.add(idx1)
140
- paired_items_2.add(idx2)
141
- pairs.append((idx1, idx2, similarity))
142
-
143
- # add items as empty pairs which weren't matched but index is considered across all pages to avoid collisions
144
- EMPTY_ITEM_INDEX = max(df1.shape[0], df2.shape[0]) + 100 + page_id
145
- for idx1, row1 in df1_page.iterrows():
146
- if idx1 not in paired_items_1:
147
- pairs.append((idx1, EMPTY_ITEM_INDEX, 0.0))
148
- for idx2, row2 in df2_page.iterrows():
149
- if idx2 not in paired_items_2:
150
- pairs.append((EMPTY_ITEM_INDEX, idx2, 0.0))
151
-
152
- # sort pairs according to df2 items original indices
153
- sorted_pairs = sorted(pairs, key=lambda x: (x[1], x[0]))
154
-
155
- # create row for empty items in each dataframe
156
- df1_page = pd.concat([df1_page, _generate_empty_row(example_row=df1_page.iloc[0], index=EMPTY_ITEM_INDEX)])
157
- df2_page = pd.concat([df2_page, _generate_empty_row(example_row=df2_page.iloc[0], index=EMPTY_ITEM_INDEX)])
158
-
159
- # generate pairs dataset
160
- text_pairs_df = pd.concat(
161
- [
162
- df1_page.loc[[item[0] for item in sorted_pairs], :].reset_index(drop=True).add_prefix(pref1),
163
- df2_page.loc[[item[1] for item in sorted_pairs], :].reset_index(drop=True).add_prefix(pref2),
164
- pd.DataFrame(
165
- data=[item[2] for item in sorted_pairs],
166
- columns=["iou"]
167
- )
168
- ],
169
- axis=1
170
- )
171
-
172
- text_pairs_dfs_per_page.append(text_pairs_df)
173
-
174
- all_text_pairs_df = pd.concat(text_pairs_dfs_per_page, axis=0)
175
-
176
- return all_text_pairs_df
177
-
178
-
179
- def word_or_symbol_group_pair_matching(df1, df2, pref1, pref2):
180
- """Applies IOU based matching of words or symbol elements groups using rectangular bounding boxes (x1,y1,x2,y2).
181
- It makes sure that the matching between the first and second set is unique, which means that it's not allowed to have
182
- one item in two different pairs. If a pair isn't found, then an empty element is used as the pair. BUT the difference from
183
- the non-group approach is that here it's possible to match a group of words or symbols onto a single element. This way it's
184
- more likely that the OCR-detected result is evaluated correctly.
185
-
186
- Example:
187
- Let's say we have 2 words: ["abc", "d"] and target has only one word: ["abcd"] then it's better to group first
188
- two words and match them with the one target word. This way we try to evaluate the overall text detection
189
- accuracy and not the actual symbol or word boundary detection.
190
-
191
- Note: the grouping operation will happen on one line to avoid unpredictable results if word bounding boxes on
192
- neighboring lines have some intersection.
193
- """
194
- # match word pairs by page
195
- text_pairs_dfs_per_page = []
196
- unique_page_ids = sorted(list(set(df1['page'].unique().tolist() + df2['page'].unique().tolist())))
197
-
198
- for page_id in unique_page_ids:
199
- # extract words for given page only
200
- df1_page = df1[df1.page == page_id]
201
- df2_page = df2[df2.page == page_id]
202
-
203
- if not df1_page.empty and not df2_page.empty:
204
- df1_page_groups, df2_page_groups = get_connected_components(ocr1_df=df1_page, ocr2_df=df2_page)
205
-
206
- # calculate similarities
207
- similarity_metrics = calculate_ious_fast(ocr1_df=df1_page_groups, ocr2_df=df2_page_groups)
208
- similarities = []
209
- for idx1, index1 in enumerate(df1_page_groups.index):
210
- for idx2, index2 in enumerate(df2_page_groups.index):
211
- similarities.append((index1, index2, similarity_metrics[idx1, idx2]))
212
-
213
- # process pair similarities in decreasing order of similarity values
214
- sorted_similarities = sorted(similarities, key=lambda x: -x[2])
215
- paired_items_1 = set()
216
- paired_items_2 = set()
217
- pairs = []
218
- for idx1, idx2, similarity in sorted_similarities:
219
- if idx1 not in paired_items_1 and idx2 not in paired_items_2:
220
- if similarity > 0.0:
221
- paired_items_1.add(idx1)
222
- paired_items_2.add(idx2)
223
- pairs.append((idx1, idx2, similarity))
224
-
225
- # add items as empty pairs which weren't matched but index is considered across all pages to avoid collisions
226
- EMPTY_ITEM_INDEX = max(df1.shape[0], df2.shape[0]) + 100 + page_id
227
- for idx1, row1 in df1_page_groups.iterrows():
228
- if idx1 not in paired_items_1:
229
- pairs.append((idx1, EMPTY_ITEM_INDEX, 0.0))
230
- for idx2, row2 in df2_page_groups.iterrows():
231
- if idx2 not in paired_items_2:
232
- pairs.append((EMPTY_ITEM_INDEX, idx2, 0.0))
233
-
234
- # sort pairs according to df2 items original indices
235
- sorted_pairs = sorted(pairs, key=lambda x: (x[1], x[0]))
236
-
237
- # create row for empty items in each dataframe
238
- df1_page_groups = pd.concat(
239
- [df1_page_groups, _generate_empty_row(example_row=df1_page_groups.iloc[0], index=EMPTY_ITEM_INDEX)])
240
- df2_page_groups = pd.concat(
241
- [df2_page_groups, _generate_empty_row(example_row=df2_page_groups.iloc[0], index=EMPTY_ITEM_INDEX)])
242
-
243
- # generate pairs dataset
244
- text_pairs_df = pd.concat(
245
- [
246
- df1_page_groups.loc[[item[0] for item in sorted_pairs], :].reset_index(drop=True).add_prefix(pref1),
247
- df2_page_groups.loc[[item[1] for item in sorted_pairs], :].reset_index(drop=True).add_prefix(pref2),
248
- pd.DataFrame(
249
- data=[item[2] for item in sorted_pairs],
250
- columns=["iou"]
251
- )
252
- ],
253
- axis=1
254
- )
255
-
256
- text_pairs_dfs_per_page.append(text_pairs_df)
257
-
258
- all_text_pairs_df = pd.concat(text_pairs_dfs_per_page, axis=0)
259
-
260
- return all_text_pairs_df
261
-
262
- def calculate_ious_fast(ocr1_df, ocr2_df):
263
- ious = None
264
- if not ocr1_df.empty and not ocr2_df.empty:
265
- bboxes1 = np.array(ocr1_df["bounding_box"].values.tolist())
266
- bboxes2 = np.array(ocr2_df["bounding_box"].values.tolist())
267
-
268
- if len(bboxes1) > 0 and len(bboxes2) > 0:
269
- ious = bb_intersection_over_union_vectorized(bboxes1=bboxes1, bboxes2=bboxes2)
270
-
271
- return ious
272
-
273
-
274
- def calculate_iosl_fast(ocr1_df, ocr2_df):
275
- iosls = None
276
- if not ocr1_df.empty and not ocr2_df.empty:
277
- bboxes1 = np.array(ocr1_df["bounding_box"].values.tolist())
278
- bboxes2 = np.array(ocr2_df["bounding_box"].values.tolist())
279
-
280
- if len(bboxes1) > 0 and len(bboxes2) > 0:
281
- iosls = bb_is_on_same_line_vectorized(bboxes1=bboxes1, bboxes2=bboxes2)
282
-
283
- return iosls
284
-
285
-
286
- def calculate_adjacency_matrix(ocr1_df, ocr2_df):
287
- """Calculates Adjacency Matrix based on IOU values and for two different sets of items. For each item the adjacency
288
- is defined by the maximum IOU value. We do 2 sided approach since it can be the case that i is adjacent to j but j
289
- isn't adjacent to i, so we generate adjacency matrix for directed graph"""
290
- # concat both dataframes
291
- ocr_df = pd.concat([ocr1_df, ocr2_df], axis=0).reset_index()
292
-
293
- # calculate ious
294
- ious = calculate_ious_fast(ocr1_df=ocr_df, ocr2_df=ocr_df)
295
-
296
- # calculate `is on same line` property
297
- iosls = calculate_iosl_fast(ocr1_df=ocr_df, ocr2_df=ocr_df)
298
-
299
- # build adjacency matrix (1s and 0s)
300
- adjacency_matrix = np.bitwise_and(ious > 0.0, iosls).astype(np.int)
301
-
302
- return adjacency_matrix
303
-
304
-
305
- def get_connected_components(ocr1_df, ocr2_df):
306
- """Apply connected component analysis and group items"""
307
-
308
- def _aggregate_group_items_into_one(df):
309
- if len(df) == 1:
310
- return df
311
- else:
312
- _df = df.iloc[0, :]
313
- _bboxes = np.array(df["bounding_box"].values.tolist())
314
-
315
-
316
- _df["bounding_box"] = [
317
- [
318
- np.min(_bboxes[:, 0]),
319
- np.min(_bboxes[:, 1]),
320
- np.max(_bboxes[:, 2]),
321
- np.max(_bboxes[:, 3]),
322
- ]
323
- ]
324
- _df["confidence"] = df["confidence"].mean()
325
- _df["text"] = " ".join(df["text"].tolist())
326
-
327
- return _df
328
-
329
- # 1. calculate adjacency matrix
330
- adjacency_matrix = calculate_adjacency_matrix(ocr1_df=ocr1_df, ocr2_df=ocr2_df)
331
-
332
- # 2. find connected components
333
- n_components, labels = connected_components(csgraph=csr_matrix(adjacency_matrix), directed=False,
334
- return_labels=True)
335
-
336
- # 3. separate df1 and df2 items and group for each connected component
337
- connected_component_groups = pd.Series(labels).to_frame().groupby(0).apply(
338
- lambda x: {1: [item for item in x.index.tolist() if item < ocr1_df.shape[0]],
339
- 2: [item - len(ocr1_df) for item in x.index.tolist() if item >= ocr1_df.shape[0]]}).to_dict()
340
-
341
- # 4. check if group of items are consecutive (Optional but interesting)
342
- # assert np.all(pd.DataFrame(connected_component_groups).loc[1, :].apply(
343
- # lambda x: sum(x) == (min(x) * 2 + (len(x) - 1)) * len(x) / 2 if x else True))
344
- # assert np.all(pd.DataFrame(connected_component_groups).loc[2, :].apply(
345
- # lambda x: sum(x) == (min(x) * 2 + (len(x) - 1)) * len(x) / 2 if x else True))
346
-
347
- # 5. merge group items into one
348
- ocr1_df_groups = pd.concat(
349
- [
350
- _aggregate_group_items_into_one(
351
- ocr1_df.loc[group_data[1], :]
352
- )
353
- for group_id, group_data in connected_component_groups.items()
354
- if group_data[1]
355
- ],
356
- axis=0
357
- ).reset_index(drop=True)
358
-
359
- ocr2_df_groups = pd.concat(
360
- [
361
- _aggregate_group_items_into_one(
362
- ocr2_df.loc[group_data[2], :]
363
- )
364
- for group_id, group_data in connected_component_groups.items()
365
- if group_data[2]
366
- ],
367
- axis=0
368
- ).reset_index(drop=True)
369
-
370
- return ocr1_df_groups, ocr2_df_groups
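For reference, a minimal sketch of what the deleted bb_intersection_over_union computes, using two made-up boxes in (x1, y1, x2, y2) format (the EPS smoothing term is omitted here for clarity; the function itself now lives in the external ocr_evaluation repository):

# Sketch only: toy boxes, values chosen purely for illustration.
boxA = (0, 0, 10, 10)
boxB = (5, 0, 15, 10)

# Same arithmetic as bb_intersection_over_union above, without the EPS smoothing.
inter_w = max(0, min(boxA[2], boxB[2]) - max(boxA[0], boxB[0]))
inter_h = max(0, min(boxA[3], boxB[3]) - max(boxA[1], boxB[1]))
inter_area = inter_w * inter_h
areaA = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1])
areaB = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1])
iou = inter_area / float(areaA + areaB - inter_area)
print(iou)  # 50 / (100 + 100 - 50) = 1/3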
evaluation/metrics.py DELETED
@@ -1,589 +0,0 @@
1
- import pandas as pd
2
-
3
- from evaluation.iou import word_or_symbol_pair_matching, word_or_symbol_group_pair_matching
4
-
5
-
6
- def text_accuracy(df, pref_1, pref_2):
7
- return (df[f'{pref_1}text'] == df[f'{pref_2}text']).sum() / df.shape[0]
8
-
9
-
10
- def text_precision(df, pref_1, pref_2):
11
- ocr1_nonempty = df[f'{pref_1}text'].apply(lambda x: bool(x))
12
- ocr1 = df[f'{pref_1}text']
13
- ocr2 = df[f'{pref_2}text']
14
- return (ocr1_nonempty & (ocr1 == ocr2)).sum() / ocr1_nonempty.sum()
15
-
16
-
17
- def text_recall(df, pref_1, pref_2):
18
- ocr2_nonempty = df[f'{pref_2}text'].apply(lambda x: bool(x))
19
- ocr2 = df[f'{pref_1}text']
20
- ocr1 = df[f'{pref_2}text']
21
- return (ocr2_nonempty & (ocr2 == ocr1)).sum() / ocr2_nonempty.sum()
22
-
23
-
24
- def text_f1(df, pref_1, pref_2):
25
- precision = text_precision(df, pref_1, pref_2)
26
- recall = text_recall(df, pref_1, pref_2)
27
-
28
- if precision == 0 or recall == 0:
29
- f1 = 0.0
30
- else:
31
- f1 = (2 * precision * recall) / (precision + recall)
32
-
33
- return f1
34
-
35
-
36
- def symbol_confusion_matrix(df, pref_1, pref_2):
37
- all_symbols = list(sorted(set(df[f'{pref_1}text'].tolist() + df[f'{pref_2}text'].tolist())))
38
- pair_value_counts = df[
39
- [f'{pref_1}text', f'{pref_2}text']
40
- ].value_counts()
41
-
42
- pair_cnts = pair_value_counts.reset_index().rename({0: "count"}, axis=1).sort_values(
43
- by=[f'{pref_1}text', f'{pref_2}text'], ascending=True)
44
-
45
- pair_value_counts_dict = pair_value_counts.to_dict()
46
-
47
- confusion_matrix = pd.DataFrame(
48
- [
49
- [pair_value_counts_dict.get((symbol1, symbol2), 0) for symbol2 in all_symbols]
50
- for symbol1 in all_symbols
51
- ],
52
- columns=all_symbols,
53
- index=all_symbols,
54
- )
55
-
56
- return confusion_matrix, pair_cnts
57
-
58
-
59
- def levenstein(text1, text2):
60
- """Measures the metrics based on edit operations.
61
- - levenstein_distance: number of character operations (insertion, deletion, substitution) that
62
- required to get text2 from text1
63
- - levenstein_similarity: number of matches divided by the number of all operations (fraction of characters that
64
- don't require modification while transforming text1 into text2)
65
- - edit_operations: list of character operations (<operation name>, <text1 character>, <text2 character>)
66
- """
67
- levenstein_distance, edit_operations = edit_distance(text1, text2)
68
- if levenstein_distance == 0:
69
- levenstein_similarity = 1.0
70
- else:
71
- matches_cnt = len([item for item in edit_operations if item[0] == "match"])
72
- all_operations_cnt = len(edit_operations)
73
-
74
- if matches_cnt == 0:
75
- levenstein_similarity = 0.0
76
- else:
77
- levenstein_similarity = float(matches_cnt / all_operations_cnt)
78
-
79
- return levenstein_similarity, levenstein_distance, edit_operations
80
-
81
-
82
- def edit_distance(text1, text2):
83
- """
84
- we have three allowed edit operations:
85
- - Insert a character
86
- - Delete a character
87
- - Substitute a character
88
- Each of these operations has cost of 1
89
- Our goal is to minimize number of required operations to convert text1 into text2
90
- This DP problem which is being solved with 2d array (NxM) where N is the length of text1 and M - length of
91
- text2.
92
-
93
- DP[i][j]: this is minimum amount of operations to convert text1[:i] into text2[:j]
94
- The update rule is the following:
95
- DP[i][j] = min of the following
96
-
97
- case 1: DP[i-1][j-1] # match
98
- case 2: DP[i-1][j] + 1 # insertion,
99
- case 3: DP[i][j-1] + 1 # deletion
100
- case 4: DP[i-1][j-1] + 1 # substitution
101
-
102
- Example:
103
- text1 = "horse"
104
- text2 = "ros"
105
-
106
- DP _ r o s
107
- _ [0, 1, 2, 3]
108
- h [1, 1, 2, 3]
109
- o [2, 2, 1, 2]
110
- r [3, 2, 2, 2]
111
- s [4, 3, 3, 2]
112
- e [5, 4, 4, 3]
113
- """
114
- if not text1:
115
- return len(text2), []
116
- elif not text2:
117
- return len(text1), []
118
-
119
- INF = 10 ** 10
120
- N = len(text1)
121
- M = len(text2)
122
-
123
- DP = [[INF for _ in range(M + 1)] for _ in range(N + 1)]
124
- P = [[None for _ in range(M + 1)] for _ in range(N + 1)]
125
-
126
- for i in range(N + 1):
127
- DP[i][0] = i
128
- P[i][0] = "insertion"
129
- for j in range(M + 1):
130
- DP[0][j] = j
131
- P[0][j] = "deletion"
132
-
133
- for j in range(1, M + 1):
134
- for i in range(1, N + 1):
135
-
136
- pair_mismatch = int(text1[i - 1] != text2[j - 1])
137
- match_case = None
138
- match_cost = INF
139
-
140
- # match
141
- if match_cost > DP[i - 1][j - 1] + pair_mismatch:
142
- match_cost = DP[i - 1][j - 1] + pair_mismatch
143
- match_case = "substitution" if pair_mismatch == 1 else "match"
144
-
145
- # insertion
146
- if match_cost > DP[i - 1][j] + 1:
147
- match_cost = DP[i - 1][j] + 1
148
- match_case = "insertion"
149
-
150
- # deletion
151
- if match_cost > DP[i][j - 1] + 1:
152
- match_cost = DP[i][j - 1] + 1
153
- match_case = "deletion"
154
-
155
- DP[i][j] = match_cost
156
- P[i][j] = match_case
157
-
158
- operations = []
159
- i = N
160
- j = M
161
- while (i >= 0 and j >= 0) and not (i == 0 and j == 0):
162
- if P[i][j] == "substitution":
163
- operations.append(("substitution", text1[i - 1] if i - 1 >= 0 else "",
164
- text2[j - 1] if j - 1 >= 0 else "", i - 1, j - 1))
165
- i -= 1
166
- j -= 1
167
- elif P[i][j] == "match":
168
- operations.append(
169
- ("match", text1[i - 1] if i - 1 >= 0 else "", text2[j - 1] if j - 1 >= 0 else "", i - 1, j - 1))
170
- i -= 1
171
- j -= 1
172
- elif P[i][j] == "insertion":
173
- operations.append(("insertion", text1[i - 1] if i - 1 >= 0 else "",
174
- "", i - 1, j - 1))
175
- i -= 1
176
- elif P[i][j] == "deletion":
177
- operations.append(("deletion", "",
178
- text2[j - 1] if j - 1 >= 0 else "", i - 1, j - 1))
179
- j -= 1
180
-
181
- levenstein_distance = DP[N][M]
182
- operations = operations[::-1]
183
-
184
- return levenstein_distance, operations
185
-
186
-
187
- def levenstein_metrics(df, pref_1="Pred_", pref_2='Tar_'):
188
- levenstein_results = df[[f'{pref_1}text', f'{pref_2}text']].apply(
189
- lambda x: levenstein(text1=x[f'{pref_1}text'], text2=x[f'{pref_2}text']),
190
- axis=1
191
- )
192
- levenstein_similarities = levenstein_results.apply(lambda x: x[0])
193
- levenstein_distances = levenstein_results.apply(lambda x: x[1])
194
- edit_operations = levenstein_results.apply(lambda x: x[2])
195
-
196
- return levenstein_similarities, levenstein_distances, edit_operations
197
-
198
-
199
- def evaluate_by_words(pred_df, target_df, pred_pref='Pred_', target_pref='Target_', **kwargs):
200
- if not pred_df.empty and not target_df.empty:
201
-
202
- show_hist = kwargs.get("show_hist", False)
203
- text_pairs = word_or_symbol_pair_matching(df1=pred_df, df2=target_df, pref1=pred_pref, pref2=target_pref)
204
- levenstein_similarities, levenstein_distances, edit_operations = levenstein_metrics(
205
- df=text_pairs, pref_1=pred_pref, pref_2=target_pref
206
- )
207
-
208
- levenstein_similarities_stats = {
209
- **levenstein_similarities.describe().to_dict(),
210
- "values": levenstein_similarities.tolist()
211
- }
212
- levenstein_distances_stats = {
213
- **levenstein_distances.describe().to_dict(),
214
- "values": levenstein_distances.tolist()
215
- }
216
- iou_stats = {
217
- **text_pairs.iou.describe().to_dict(),
218
- "values": text_pairs.iou.tolist()
219
- }
220
- edit_operations_stats = {
221
- operation_id: pd.Series(
222
- edit_operations.apply(
223
- lambda x: [f"[{item[1]}]_[{item[2]}]" for item in x if item[0] == operation_id]
224
- ).sum(axis=0)).value_counts().to_dict()
225
- for operation_id in ["insertion", "deletion", "substitution"]
226
- }
227
-
228
- if show_hist is True:
229
- pd.Series(levenstein_similarities).plot(kind='hist', bins=20, title="Levenstein Similarities")
230
- pd.Series(levenstein_distances).plot(kind='hist', bins=20, title="Levenstein Distances")
231
- for edit_operation_id, edit_operation_data in edit_operations_stats.items():
232
- pd.Series(edit_operation_data).plot(kind='barh', title=f"{edit_operation_id.capitalize()} Stats")
233
-
234
- report = {
235
- "accuracy": text_accuracy(df=text_pairs, pref_1=pred_pref, pref_2=target_pref),
236
- "precision": text_precision(df=text_pairs, pref_1=pred_pref, pref_2=target_pref),
237
- "recall": text_recall(df=text_pairs, pref_1=pred_pref, pref_2=target_pref),
238
- "f1": text_f1(df=text_pairs, pref_1=pred_pref, pref_2=target_pref),
239
- "levenstein_distances_stats": levenstein_distances_stats,
240
- "levenstein_similarities_stats": levenstein_similarities_stats,
241
- "iou_stats": iou_stats,
242
- "edit_operations_stats": edit_operations_stats,
243
- }
244
- else:
245
- report = {
246
- "accuracy": None,
247
- "precision": None,
248
- "recall": None,
249
- "f1": None,
250
- "levenstein_distances_stats": {},
251
- "levenstein_similarities_stats": {},
252
- "iou_stats": {},
253
- "edit_operations_stats": {key: {} for key in ["insertion", "deletion", "substitution"]},
254
- }
255
-
256
- return report
257
-
258
-
259
- def evaluate_by_word_groups(pred_df, target_df, pred_pref='Pred_', target_pref='Target_', **kwargs):
260
- if not pred_df.empty and not target_df.empty:
261
-
262
- show_hist = kwargs.get("show_hist", False)
263
- text_pairs = word_or_symbol_group_pair_matching(df1=pred_df, df2=target_df, pref1=pred_pref, pref2=target_pref)
264
- levenstein_similarities, levenstein_distances, edit_operations = levenstein_metrics(
265
- df=text_pairs, pref_1=pred_pref, pref_2=target_pref
266
- )
267
-
268
- levenstein_similarities_stats = {
269
- **levenstein_similarities.describe().to_dict(),
270
- "values": levenstein_similarities.tolist()
271
- }
272
- levenstein_distances_stats = {
273
- **levenstein_distances.describe().to_dict(),
274
- "values": levenstein_distances.tolist()
275
- }
276
- iou_stats = {
277
- **text_pairs.iou.describe().to_dict(),
278
- "values": text_pairs.iou.tolist()
279
- }
280
- edit_operations_stats = {
281
- operation_id: pd.Series(
282
- edit_operations.apply(
283
- lambda x: [f"[{item[1]}]_[{item[2]}]" for item in x if item[0] == operation_id]
284
- ).sum(axis=0)).value_counts().to_dict()
285
- for operation_id in ["insertion", "deletion", "substitution"]
286
- }
287
-
288
- if show_hist is True:
289
- pd.Series(levenstein_similarities).plot(kind='hist', bins=20, title="Levenstein Similarities")
290
- pd.Series(levenstein_distances).plot(kind='hist', bins=20, title="Levenstein Distances")
291
- for edit_operation_id, edit_operation_data in edit_operations_stats.items():
292
- pd.Series(edit_operation_data).plot(kind='barh', title=f"{edit_operation_id.capitalize()} Stats")
293
-
294
- report = {
295
- "accuracy": text_accuracy(df=text_pairs, pref_1=pred_pref, pref_2=target_pref),
296
- "precision": text_precision(df=text_pairs, pref_1=pred_pref, pref_2=target_pref),
297
- "recall": text_recall(df=text_pairs, pref_1=pred_pref, pref_2=target_pref),
298
- "f1": text_f1(df=text_pairs, pref_1=pred_pref, pref_2=target_pref),
299
- "levenstein_distances_stats": levenstein_distances_stats,
300
- "levenstein_similarities_stats": levenstein_similarities_stats,
301
- "iou_stats": iou_stats,
302
- "edit_operations_stats": edit_operations_stats,
303
- }
304
- else:
305
- report = {
306
- "accuracy": None,
307
- "precision": None,
308
- "recall": None,
309
- "f1": None,
310
- "levenstein_distances_stats": {},
311
- "levenstein_similarities_stats": {},
312
- "iou_stats": {},
313
- "edit_operations_stats": {key: {} for key in ["insertion", "deletion", "substitution"]},
314
- }
315
-
316
- return report
317
-
318
-
319
- def reduce_word_evaluation_results(eval_results):
320
- if eval_results:
321
- accuracies = pd.Series([item['accuracy'] for item in eval_results])
322
- precisions = pd.Series([item['precision'] for item in eval_results])
323
- recalls = pd.Series([item['recall'] for item in eval_results])
324
- f1s = pd.Series([item['f1'] for item in eval_results])
325
- levenstein_similarities = pd.Series(
326
- [
327
- pd.Series(item['levenstein_similarities_stats'].get('values', [])).mean()
328
- for item in eval_results
329
- ]
330
- )
331
- levenstein_distances = pd.Series(
332
- [
333
- pd.Series(item['levenstein_distances_stats'].get('values', [])).mean()
334
- for item in eval_results
335
- ]
336
- )
337
- ious = pd.Series(
338
- [
339
- pd.Series(item['iou_stats'].get('values', [])).mean()
340
- for item in eval_results
341
- ]
342
- )
343
-
344
- levenstein_similarities_stats = {
345
- **levenstein_similarities.describe().to_dict(),
346
- "values": levenstein_similarities.tolist()
347
- }
348
- levenstein_distances_stats = {
349
- **levenstein_distances.describe().to_dict(),
350
- "values": levenstein_distances.tolist()
351
- }
352
- iou_stats = {
353
- **ious.describe().to_dict(),
354
- "values": ious.tolist()
355
- }
356
-
357
- edit_operations_stats = {}
358
- for eval_result in eval_results:
359
- for edit_operation, edit_operation_data in eval_result['edit_operations_stats'].items():
360
- if edit_operation not in edit_operations_stats:
361
- edit_operations_stats[edit_operation] = {}
362
-
363
- for key, count in edit_operation_data.items():
364
- edit_operations_stats[edit_operation][key] = edit_operations_stats[edit_operation].get(key,
365
- 0) + count
366
-
367
- summary = {
368
- "accuracy": {
369
- "mean": accuracies.mean(),
370
- "std": accuracies.std(),
371
- "values": accuracies.tolist()
372
- },
373
- "precision": {
374
- "mean": precisions.mean(),
375
- "std": precisions.std(),
376
- "values": precisions.tolist(),
377
- },
378
- "recall": {
379
- "mean": recalls.mean(),
380
- "std": recalls.std(),
381
- "values": recalls.tolist(),
382
- },
383
- "f1": {
384
- "mean": f1s.mean(),
385
- "std": f1s.std(),
386
- "values": f1s.tolist(),
387
- },
388
- "document_count": len(eval_results),
389
- "levenstein_distances_stats": levenstein_distances_stats,
390
- "levenstein_similarities_stats": levenstein_similarities_stats,
391
- "iou_stats": iou_stats,
392
- "edit_operations_stats": edit_operations_stats,
393
- }
394
-
395
-
396
- else:
397
- summary = {
398
- "accuracy": {},
399
- "precision": {},
400
- "recall": {},
401
- "f1": {},
402
- "document_count": 0,
403
- "levenstein_distances_stats": {},
404
- "levenstein_similarities_stats": {},
405
- "iou_stats": {},
406
- "edit_operations_stats": {key: {} for key in ["insertion", "deletion", "substitution"]},
407
- }
408
-
409
- return summary
410
-
411
-
412
- def evaluate_by_symbols(pred_df, target_df, pred_pref='Pred_', target_pref='Target_', **kwargs):
413
- if not pred_df.empty and not target_df.empty:
414
-
415
- show_hist = kwargs.get("show_hist", False)
416
- text_pairs = word_or_symbol_pair_matching(df1=pred_df, df2=target_df, pref1=pred_pref, pref2=target_pref)
417
-
418
- confusion_matrix, pair_counts = symbol_confusion_matrix(text_pairs, pref_1=pred_pref, pref_2=target_pref)
419
-
420
- iou_stats = {
421
- **text_pairs.iou.describe().to_dict(),
422
- "values": text_pairs.iou.tolist()
423
- }
424
-
425
- if show_hist is True:
426
- pd.Series(pair_counts).plot(kind='barh', title="Symbol Pair Counts")
427
-
428
- report = {
429
- "accuracy": text_accuracy(df=text_pairs, pref_1=pred_pref, pref_2=target_pref),
430
- "precision": text_precision(df=text_pairs, pref_1=pred_pref, pref_2=target_pref),
431
- "recall": text_recall(df=text_pairs, pref_1=pred_pref, pref_2=target_pref),
432
- "f1": text_f1(df=text_pairs, pref_1=pred_pref, pref_2=target_pref),
433
- "confusion_matrix": confusion_matrix,
434
- "pair_counts": pair_counts,
435
- "iou_stats": iou_stats,
436
- }
437
- else:
438
- report = {
439
- "accuracy": None,
440
- "precision": None,
441
- "recall": None,
442
- "f1": None,
443
- "confusion_matrix": pd.DataFrame(),
444
- "pair_counts": pd.DataFrame(),
445
- "iou_stats": {},
446
- }
447
-
448
- return report
449
-
450
-
451
- def reduce_pair_counts(pair_counts):
452
- reduced_pair_counts_df = pd.DataFrame()
453
- columns = []
454
- if pair_counts:
455
- pair_counts_dict = {}
456
- for pair_count in pair_counts:
457
- if not pair_count.empty:
458
- pair_count_dict = pair_count.set_index(pair_count.columns[:-1].tolist(), drop=True).to_dict()[
459
- pair_count.columns[-1]]
460
- columns = pair_count.columns.tolist()
461
- else:
462
- pair_count_dict = {}
463
-
464
- for key, value in pair_count_dict.items():
465
- pair_counts_dict[key] = pair_counts_dict.get(key, 0) + value
466
-
467
- reduced_pair_counts_df = pd.Series(pair_counts_dict).to_frame().reset_index()
468
- if columns:
469
- reduced_pair_counts_df.columns = columns
470
-
471
- return reduced_pair_counts_df
472
-
473
-
474
- def reduce_confusion_matrices(confusion_matrices):
475
- reduced_confusion_matrices_df = pd.DataFrame()
476
- if confusion_matrices:
477
- all_index_values = set()
478
- confusion_matrices_dict = {}
479
- for confusion_matrix in confusion_matrices:
480
- if not confusion_matrix.empty:
481
- confusion_matrix_dict = {
482
- (index, column): confusion_matrix.loc[index, column]
483
- for index in confusion_matrix.index
484
- for column in confusion_matrix.columns
485
- }
486
- else:
487
- confusion_matrix_dict = {}
488
-
489
- for key, value in confusion_matrix_dict.items():
490
- all_index_values.add(key[0])
491
- all_index_values.add(key[1])
492
- confusion_matrices_dict[key] = confusion_matrices_dict.get(key, 0) + value
493
-
494
- all_index_values = list(sorted(list(all_index_values)))
495
- reduced_confusion_matrices_df = pd.DataFrame(
496
- [
497
- [
498
- confusion_matrices_dict.get((index, column), 0)
499
- for column in all_index_values
500
- ]
501
- for index in all_index_values
502
- ],
503
- columns=all_index_values,
504
- index=all_index_values,
505
- )
506
-
507
- return reduced_confusion_matrices_df
508
-
509
-
510
- def reduce_symbol_evaluation_results(eval_results):
511
- """
512
- all_symbols = list(sorted(set(df[f'{pref_1}text'].tolist() + df[f'{pref_2}text'].tolist())))
513
- pair_value_counts = df[
514
- [f'{pref_1}text', f'{pref_2}text']
515
- ].value_counts()
516
-
517
- pair_cnts = pair_value_counts.reset_index().rename({0: "count"}, axis=1).sort_values(
518
- by=[f'{pref_1}text', f'{pref_2}text'], ascending=True)
519
-
520
- pair_value_counts_dict = pair_value_counts.to_dict()
521
-
522
- confusion_matrix = pd.DataFrame(
523
- [
524
- [pair_value_counts_dict.get((symbol1, symbol2), 0) for symbol2 in all_symbols]
525
- for symbol1 in all_symbols
526
- ],
527
- columns=all_symbols,
528
- index=all_symbols,
529
- )
530
- """
531
- if eval_results:
532
- accuracies = pd.Series([item['accuracy'] for item in eval_results])
533
- precisions = pd.Series([item['precision'] for item in eval_results])
534
- recalls = pd.Series([item['recall'] for item in eval_results])
535
- f1s = pd.Series([item['f1'] for item in eval_results])
536
- confusion_matrices = [item['confusion_matrix'] for item in eval_results]
537
- pair_counts = [item['pair_counts'] for item in eval_results]
538
- ious = pd.Series(
539
- [
540
- pd.Series(item['iou_stats'].get('values', [])).mean()
541
- for item in eval_results
542
- ]
543
- )
544
-
545
- iou_stats = {
546
- **ious.describe().to_dict(),
547
- "values": ious.tolist()
548
- }
549
-
550
- summary = {
551
- "accuracy": {
552
- "mean": accuracies.mean(),
553
- "std": accuracies.std(),
554
- "values": accuracies.tolist()
555
- },
556
- "precision": {
557
- "mean": precisions.mean(),
558
- "std": precisions.std(),
559
- "values": precisions.tolist(),
560
- },
561
- "recall": {
562
- "mean": recalls.mean(),
563
- "std": recalls.std(),
564
- "values": recalls.tolist(),
565
- },
566
- "f1": {
567
- "mean": f1s.mean(),
568
- "std": f1s.std(),
569
- "values": f1s.tolist(),
570
- },
571
- "document_count": len(eval_results),
572
- "pair_counts": reduce_pair_counts(pair_counts),
573
- "confusion_matrix": reduce_confusion_matrices(confusion_matrices),
574
- "iou_stats": iou_stats,
575
- }
576
-
577
- else:
578
- summary = {
579
- "accuracy": {},
580
- "precision": {},
581
- "recall": {},
582
- "f1": {},
583
- "document_count": 0,
584
- "pair_counts": pd.DataFrame(),
585
- "confusion_matrix": pd.DataFrame(),
586
- "iou_stats": {},
587
- }
588
-
589
- return summary
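The edit_distance docstring above already walks through the DP table for converting "horse" into "ros" (distance 3). A minimal usage sketch of the deleted helpers, assuming the relocated ocr_evaluation package keeps the same levenstein API (the import path below mirrors the import change in iliauniiccocrevaluation.py and is an assumption):

# Sketch only: runs the docstring's own example through the helper.
from ocr_evaluation.evaliate.metrics import levenstein  # assumed path

similarity, distance, operations = levenstein("horse", "ros")
print(distance)       # 3, matching the DP table in the edit_distance docstring
print(similarity)     # fraction of "match" steps among all edit operations
print(operations[:2]) # list of (<operation>, <text1 char>, <text2 char>, i, j) tuples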
iliauniiccocrevaluation.py CHANGED
@@ -17,8 +17,8 @@ import datasets
  import evaluate

  # TODO: Add BibTeX citation
- from evaluation.metrics import evaluate_by_words
- from ocr.fiftyone import FiftyOneOcr
+ from ocr_evaluation.evaliate.metrics import evaluate_by_words
+ from ocr_evaluation.ocr.fiftyone import FiftyOneOcr

  _CITATION = """\
  @InProceedings{huggingface:module,
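A hedged sketch of how the module is expected to be used after this change. The import paths come from the diff above; the FiftyOne-style payloads and their field values are hypothetical, and the sketch assumes the external ocr_evaluation package keeps the same API as the local modules deleted in this commit:

# Sketch only: assumes ocr_evaluation mirrors the removed local modules.
from ocr_evaluation.evaliate.metrics import evaluate_by_words
from ocr_evaluation.ocr.fiftyone import FiftyOneOcr

# Hypothetical FiftyOne-style payloads; bounding boxes are (x, y, width, height).
pred_sample = {"detections": {"detections": [
    {"page": 0, "text": "hello", "confidence": 0.9, "bounding_box": [0.10, 0.10, 0.20, 0.05]},
]}}
target_sample = {"detections": {"detections": [
    {"page": 0, "text": "hello", "confidence": 1.0, "bounding_box": [0.11, 0.10, 0.19, 0.05]},
]}}

pred_df = FiftyOneOcr(data=pred_sample).get_word_annotations()
target_df = FiftyOneOcr(data=target_sample).get_word_annotations()
report = evaluate_by_words(pred_df=pred_df, target_df=target_df)
print(report["accuracy"], report["f1"])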
ocr/fiftyone.py DELETED
@@ -1,26 +0,0 @@
1
- import pandas as pd
2
- import numpy as np
3
-
4
-
5
- class FiftyOneOcr:
6
- def __init__(self, data):
7
- self.data = data
8
-
9
- def get_word_annotations(self, convert_bbox: bool = True) -> pd.DataFrame:
10
- """Returns dataframe of detections where each row represents independent word annotation
11
-
12
- Args:
13
- convert_bbox: convert the FiftyOne bounding box format (x1, y1, dx, dy) to the 2-point bounding box format (x1, y1, x2, y2)
14
- """
15
-
16
- annotations = self.data.get("detections", {}).get("detections", {})
17
-
18
- annotations_df = pd.DataFrame(annotations)
19
-
20
- # convert bounding box into 2 point values format
21
- if convert_bbox:
22
- bbox = np.array(annotations_df['bounding_box'].values.tolist())
23
- bbox[:, 2:] += bbox[:, :2]
24
- annotations_df['bounding_box'] = bbox.tolist()
25
-
26
- return annotations_df
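The bounding-box conversion performed by get_word_annotations simply adds the top-left corner to the width/height columns; a minimal sketch with made-up numbers:

import numpy as np

# Sketch only: FiftyOne stores boxes as (x, y, width, height); the method above
# converts them to (x1, y1, x2, y2) with bbox[:, 2:] += bbox[:, :2].
bbox = np.array([[0.10, 0.20, 0.30, 0.05]])  # one made-up normalized box
bbox[:, 2:] += bbox[:, :2]
print(bbox.tolist())  # [[0.1, 0.2, 0.4, 0.25]]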
requirements.txt CHANGED
@@ -1 +1,2 @@
- git+https://github.com/huggingface/evaluate@main
+ git+https://github.com/huggingface/evaluate@main
+ git+https://github.com/IliaUni-ICC/ocr_evaluation@main
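After installing the updated requirements (for example with pip install -r requirements.txt), a quick hedged check that the relocated package is importable; the module paths simply mirror the new imports in iliauniiccocrevaluation.py above:

# Sketch only: verifies the git-installed ocr_evaluation package can be imported.
import importlib

for module_name in ("ocr_evaluation.evaliate.metrics", "ocr_evaluation.ocr.fiftyone"):
    importlib.import_module(module_name)
print("ocr_evaluation modules imported successfully")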