Revision of ocr_eval_engine.py

akadmin revised this gist 5 months ago. Go to revision

2 files changed, 84 insertions

ocr_eval_engine.py(file created)

		@@ -0,0 +1,77 @@
1	+	# ocr_eval_engine.py
2	+
3	+	import jiwer
4	+	from fuzzywuzzy import fuzz
5	+
6	+
class OCREvaluator:
    """Compare a ground-truth text against OCR hypothesis texts.

    One instance holds a single ground-truth string; ``evaluate`` may be
    called repeatedly with different hypothesis strings.  Two metrics are
    produced: an order-strict character error rate (via ``jiwer``) and an
    order-tolerant error rate derived from ``fuzzywuzzy``'s token-sort
    similarity.
    """

    def __init__(self, ground_truth_text: str):
        """Store the reference text every hypothesis is compared against.

        :param ground_truth_text: the ground-truth (reference) text.
        """
        # Text is kept as a Python ``str`` (Unicode) and is never re-encoded.
        self.ground_truth = ground_truth_text

    def evaluate(self, hypothesis_text: str) -> dict:
        """Compute every metric for one hypothesis text.

        :param hypothesis_text: the OCR output to score.
        :return: dict with keys ``strict_cer``, ``substitutions``,
            ``deletions``, ``insertions``, ``hits`` and ``flexible_cer``.
        """
        strict = self._calculate_strict_cer(self.ground_truth, hypothesis_text)
        flexible = self._calculate_flexible_cer(self.ground_truth, hypothesis_text)

        return {
            "strict_cer": strict["cer"],
            "substitutions": strict["S"],
            "deletions": strict["D"],
            "insertions": strict["I"],
            "hits": strict["H"],
            "flexible_cer": flexible,
        }

    def _calculate_strict_cer(self, ref: str, hyp: str) -> dict:
        """Compute the order-strict CER and its edit-operation counts.

        Delegates to ``jiwer.process_characters`` and repackages the fields
        it reports.

        :param ref: the ground-truth text.
        :param hyp: the hypothesis text.
        :return: dict with ``cer`` plus the counts ``S`` (substitutions),
            ``D`` (deletions), ``I`` (insertions) and ``H`` (hits).
        """
        if not ref:
            # An empty reference is handled here instead of being passed to
            # jiwer: every hypothesis character counts as an insertion, and
            # the rate is pinned to 1.0 (or 0.0 when both texts are empty).
            return {
                "cer": 1.0 if hyp else 0.0,
                "S": 0,
                "D": 0,
                "I": len(hyp),
                "H": 0,
            }

        output = jiwer.process_characters(ref, hyp)
        return {
            "cer": output.cer,
            "S": output.substitutions,
            "D": output.deletions,
            "I": output.insertions,
            "H": output.hits,
        }

    def _calculate_flexible_cer(self, ref: str, hyp: str) -> float:
        """Compute an order-tolerant error rate from token-sort similarity.

        ``fuzz.token_sort_ratio`` sorts the whitespace-separated tokens of
        both texts before comparing them, so the result tolerates reordered
        tokens (not reordered characters inside a token).  The value is
        ``1 - similarity``; it is a similarity-derived rate, not an
        edit-distance CER.

        :param ref: the ground-truth text.
        :param hyp: the hypothesis text.
        :return: error rate between 0.0 and 1.0.
        """
        # force_ascii defaults to True in fuzzywuzzy, which drops every
        # non-ASCII character before comparing.  Non-Latin text (e.g. Korean)
        # would then collapse to an empty string and unrelated texts would
        # score as identical, so it is disabled explicitly.
        # NOTE(review): fuzzywuzzy's default preprocessing still lower-cases
        # and replaces non-alphanumeric characters -- confirm that this is
        # acceptable for the "flexible" metric.
        similarity = fuzz.token_sort_ratio(ref, hyp, force_ascii=False)

        # The similarity score is on a 0-100 scale; convert to a 0-1 error rate.
        return (100 - similarity) / 100.0

requirements.txt(file created)

		@@ -0,0 +1,7 @@
1	+	fuzzywuzzy==0.18.0
2	+	jiwer==4.0.0
3	+	levenshtein==0.27.1
4	+	markupsafe==3.0.2
5	+	python-levenshtein==0.27.1
6	+	rapidfuzz==3.13.0
7	+	werkzeug==3.1.3

Newer Older