Merge 0dd634fa2c
into a30945312a
This commit is contained in:
commit
57aba5dd0d
@ -7,6 +7,8 @@ from core.rag.extractor.blob.blob import Blob
|
|||||||
from core.rag.extractor.extractor_base import BaseExtractor
|
from core.rag.extractor.extractor_base import BaseExtractor
|
||||||
from core.rag.models.document import Document
|
from core.rag.models.document import Document
|
||||||
from extensions.ext_storage import storage
|
from extensions.ext_storage import storage
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class PdfExtractor(BaseExtractor):
|
class PdfExtractor(BaseExtractor):
|
||||||
@ -53,16 +55,42 @@ class PdfExtractor(BaseExtractor):
|
|||||||
def parse(self, blob: Blob) -> Iterator[Document]:
    """Lazily parse the blob, yielding one Document per PDF page.

    The text layer of each page is extracted first. If that text is empty
    or shorter than ``min_text_chars`` after stripping whitespace
    (presumably a scanned/image-only page), the page is rendered to an
    image and passed through RapidOCR. OCR output replaces the text layer
    only when OCR actually recognizes something; otherwise the originally
    extracted text is kept rather than being discarded.

    Args:
        blob: Provides the PDF bytes via ``as_bytes_io()``; ``blob.source``
            is copied into each document's metadata.

    Yields:
        One Document per page, with ``metadata`` holding ``"source"`` and
        the zero-based ``"page"`` index.

    Raises:
        Exception: Errors raised while opening or iterating the PDF are
            logged and re-raised. OCR failures are logged and swallowed so
            a single bad page does not abort the whole document.
    """
    import logging

    import pypdfium2  # type: ignore

    logger = logging.getLogger(__name__)

    # Below this many stripped characters the text layer is treated as missing.
    min_text_chars = 10
    # Render scale handed to pypdfium2 for the OCR image
    # (presumably upscaled to improve recognition quality).
    render_scale = 2.0
    # The OCR engine is created on first use so PDFs that already carry a
    # text layer never pay for importing/initialising RapidOCR.
    ocr_engine = None

    with blob.as_bytes_io() as file_path:
        pdf_reader = pypdfium2.PdfDocument(file_path, autoclose=True)
        try:
            for page_number, page in enumerate(pdf_reader):
                try:
                    # First attempt to directly extract the text.
                    text_page = page.get_textpage()
                    try:
                        content = text_page.get_text_range()
                    finally:
                        text_page.close()

                    # If the extracted text is very short or empty, try OCR.
                    if not content or len(content.strip()) < min_text_chars:
                        # Reset per page so cleanup never touches an image
                        # left over from a previous iteration.
                        pil_image = None
                        try:
                            if ocr_engine is None:
                                from rapidocr_onnxruntime import RapidOCR

                                ocr_engine = RapidOCR()

                            # Convert the page to an image array for OCR.
                            bitmap = page.render(scale=render_scale)
                            pil_image = bitmap.to_pil()
                            result, _ = ocr_engine(np.array(pil_image))
                            if result:
                                # The second element of each result item is
                                # used as the recognized text.
                                ocr_text = "\n".join(item[1] for item in result if item[1])
                                if ocr_text:
                                    content = ocr_text
                        except Exception:
                            # Best effort: keep whatever the text layer gave us.
                            logger.exception("OCR failed for page %s", page_number)
                        finally:
                            if pil_image is not None:
                                pil_image.close()
                finally:
                    page.close()

                metadata = {"source": blob.source, "page": page_number}
                yield Document(page_content=content, metadata=metadata)
        except Exception:
            logger.exception("Error processing PDF")
            raise
        finally:
            pdf_reader.close()
|
||||||
|
@ -76,6 +76,7 @@ transformers = "~4.35.0"
|
|||||||
unstructured = { version = "~0.16.1", extras = ["docx", "epub", "md", "msg", "ppt", "pptx"] }
|
unstructured = { version = "~0.16.1", extras = ["docx", "epub", "md", "msg", "ppt", "pptx"] }
|
||||||
validators = "0.21.0"
|
validators = "0.21.0"
|
||||||
yarl = "~1.18.3"
|
yarl = "~1.18.3"
|
||||||
|
rapidocr-onnxruntime = "~1.4.4"
|
||||||
# Before adding a new dependency, consider placing it in alphabetical order (a-z) and in a suitable group.
|
# Before adding a new dependency, consider placing it in alphabetical order (a-z) and in a suitable group.
|
||||||
|
|
||||||
############################################################
|
############################################################
|
||||||
|
Loading…
Reference in New Issue
Block a user