This commit is contained in:
dfsoftai 2025-03-21 13:26:17 +08:00 committed by GitHub
commit 57aba5dd0d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 29 additions and 0 deletions

View File

@ -7,6 +7,8 @@ from core.rag.extractor.blob.blob import Blob
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
from extensions.ext_storage import storage
import numpy as np
class PdfExtractor(BaseExtractor):
@ -53,16 +55,42 @@ class PdfExtractor(BaseExtractor):
def parse(self, blob: Blob) -> Iterator[Document]:
"""Lazily parse the blob."""
import pypdfium2 # type: ignore
from rapidocr_onnxruntime import RapidOCR
ocr_engine = RapidOCR()
with blob.as_bytes_io() as file_path:
pdf_reader = pypdfium2.PdfDocument(file_path, autoclose=True)
try:
for page_number, page in enumerate(pdf_reader):
# First attempt to directly extract the text
text_page = page.get_textpage()
content = text_page.get_text_range()
# If the extracted text content is very little or empty, use OCR
if not content or len(content.strip()) < 10:
try:
# convert to image
bitmap = page.render(scale=2.0)
pil_image = bitmap.to_pil()
img_array = np.array(pil_image)
result, _ = ocr_engine(img_array)
if result:
content = "\n".join([item[1] for item in result if item[1]])
else:
content = ""
except Exception as e:
print(f"OCR failed for page {page_number}: {str(e)}")
content = ""
finally:
if 'pil_image' in locals():
pil_image.close()
text_page.close()
page.close()
metadata = {"source": blob.source, "page": page_number}
yield Document(page_content=content, metadata=metadata)
except Exception as e:
print(f"Error processing PDF: {str(e)}")
raise
finally:
pdf_reader.close()

View File

@ -76,6 +76,7 @@ transformers = "~4.35.0"
unstructured = { version = "~0.16.1", extras = ["docx", "epub", "md", "msg", "ppt", "pptx"] }
validators = "0.21.0"
yarl = "~1.18.3"
rapidocr-onnxruntime = "~1.4.4"
# Before adding new dependency, consider place it in alphabet order (a-z) and suitable group.
############################################################