OCR 분기 시점 - "Hanmac Gists"

OCR 분기 시점 · 2.4 KiB · Text Raw

# 3-1. Try direct text extraction from the PDF (PyMuPDF) before falling back to OCR.
#
# Reads:  contents, watermark_regex, MIN_MEANINGFUL_TEXT_LENGTH_PER_PAGE
# Writes: force_ocr, pdf_page_count and -- only when the extracted text passes
#         validation -- extracted_text_from_pdf and processing_method.
# NOTE(review): extracted_text_from_pdf / processing_method are presumably
# initialised earlier in the enclosing code; this fragment never resets them.
force_ocr = False  # flag: whether OCR must be forced
pdf_page_count = 0
page_separator = "\n--- Page Break ---\n"  # inserted between pages in the stored text only

try:
    print("Attempting text extraction from PDF...")
    with fitz.open(stream=contents, filetype="pdf") as doc:
        pdf_page_count = len(doc)
        if doc.needs_pass:
            print("PDF is password protected. Text extraction skipped, proceeding to OCR.")
            force_ocr = True  # password-protected: force OCR
        else:
            page_texts = [
                doc.load_page(page_num).get_text("text")
                for page_num in range(pdf_page_count)
            ]
            # Stored/output form: pages joined with an explicit page-break marker.
            all_text = page_separator.join(page_texts)
            # Validation form: the raw page text WITHOUT the markers, so the
            # markers themselves can never be mistaken for extracted content.
            raw_text = "".join(page_texts)

            if raw_text and not raw_text.isspace():
                print(f"Text extracted ({len(all_text)} chars). Validating content...")
                # Strip watermarks page by page, then measure what is left.
                cleaned_text = "".join(
                    watermark_regex.sub("", page_text) for page_text in page_texts
                ).strip()

                # 1. Text must remain after watermark removal.
                # 2. Its total length must reach (per-page threshold * page count).
                min_total_length = MIN_MEANINGFUL_TEXT_LENGTH_PER_PAGE * pdf_page_count
                if cleaned_text and len(cleaned_text) >= min_total_length:
                    print("Meaningful text found after validation.")
                    extracted_text_from_pdf = all_text.strip()  # keep the original extracted text
                    processing_method = "pdf_text_extraction"
                else:
                    print(f"Extracted text seems insufficient or mostly watermarks (Cleaned length: {len(cleaned_text)}, Threshold: {min_total_length}). Forcing OCR.")
                    force_ocr = True
            else:
                print("No text found via direct extraction. Proceeding to OCR.")
                force_ocr = True  # no text at all: force OCR
except Exception as e:
    # Deliberate best-effort: any extraction failure falls back to OCR.
    print(f"Error during PDF text extraction: {e}. Proceeding to OCR.")
    force_ocr = True

# 3-1. Try direct text extraction from the PDF (PyMuPDF) before falling back to OCR.
#
# Reads:  contents, watermark_regex, MIN_MEANINGFUL_TEXT_LENGTH_PER_PAGE
# Writes: force_ocr, pdf_page_count and -- only when the extracted text passes
#         validation -- extracted_text_from_pdf and processing_method.
# NOTE(review): extracted_text_from_pdf / processing_method are presumably
# initialised earlier in the enclosing code; this fragment never resets them.
force_ocr = False  # flag: whether OCR must be forced
pdf_page_count = 0
page_separator = "\n--- Page Break ---\n"  # inserted between pages in the stored text only

try:
    print("Attempting text extraction from PDF...")
    with fitz.open(stream=contents, filetype="pdf") as doc:
        pdf_page_count = len(doc)
        if doc.needs_pass:
            print("PDF is password protected. Text extraction skipped, proceeding to OCR.")
            force_ocr = True  # password-protected: force OCR
        else:
            page_texts = [
                doc.load_page(page_num).get_text("text")
                for page_num in range(pdf_page_count)
            ]
            # Stored/output form: pages joined with an explicit page-break marker.
            all_text = page_separator.join(page_texts)
            # Validation form: the raw page text WITHOUT the markers, so the
            # markers themselves can never be mistaken for extracted content.
            raw_text = "".join(page_texts)

            if raw_text and not raw_text.isspace():
                print(f"Text extracted ({len(all_text)} chars). Validating content...")
                # Strip watermarks page by page, then measure what is left.
                cleaned_text = "".join(
                    watermark_regex.sub("", page_text) for page_text in page_texts
                ).strip()

                # 1. Text must remain after watermark removal.
                # 2. Its total length must reach (per-page threshold * page count).
                min_total_length = MIN_MEANINGFUL_TEXT_LENGTH_PER_PAGE * pdf_page_count
                if cleaned_text and len(cleaned_text) >= min_total_length:
                    print("Meaningful text found after validation.")
                    extracted_text_from_pdf = all_text.strip()  # keep the original extracted text
                    processing_method = "pdf_text_extraction"
                else:
                    print(f"Extracted text seems insufficient or mostly watermarks (Cleaned length: {len(cleaned_text)}, Threshold: {min_total_length}). Forcing OCR.")
                    force_ocr = True
            else:
                print("No text found via direct extraction. Proceeding to OCR.")
                force_ocr = True  # no text at all: force OCR
except Exception as e:
    # Deliberate best-effort: any extraction failure falls back to OCR.
    print(f"Error during PDF text extraction: {e}. Proceeding to OCR.")
    force_ocr = True