feat: support xlsx file parsing (#304)
Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com>
This commit is contained in:
parent
bbe58327c8
commit
0abd67288b
@ -18,6 +18,7 @@ from controllers.console.setup import setup_required
|
|||||||
from controllers.console.wraps import account_initialization_required
|
from controllers.console.wraps import account_initialization_required
|
||||||
from core.index.readers.html_parser import HTMLParser
|
from core.index.readers.html_parser import HTMLParser
|
||||||
from core.index.readers.pdf_parser import PDFParser
|
from core.index.readers.pdf_parser import PDFParser
|
||||||
|
from core.index.readers.xlsx_parser import XLSXParser
|
||||||
from extensions.ext_storage import storage
|
from extensions.ext_storage import storage
|
||||||
from libs.helper import TimestampField
|
from libs.helper import TimestampField
|
||||||
from extensions.ext_database import db
|
from extensions.ext_database import db
|
||||||
@ -26,7 +27,7 @@ from models.model import UploadFile
|
|||||||
cache = TTLCache(maxsize=None, ttl=30)
|
cache = TTLCache(maxsize=None, ttl=30)
|
||||||
|
|
||||||
FILE_SIZE_LIMIT = 15 * 1024 * 1024 # 15MB
|
FILE_SIZE_LIMIT = 15 * 1024 * 1024 # 15MB
|
||||||
ALLOWED_EXTENSIONS = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm']
|
ALLOWED_EXTENSIONS = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx']
|
||||||
PREVIEW_WORDS_LIMIT = 3000
|
PREVIEW_WORDS_LIMIT = 3000
|
||||||
|
|
||||||
|
|
||||||
@ -133,6 +134,9 @@ class FilePreviewApi(Resource):
|
|||||||
# Use BeautifulSoup to extract text
|
# Use BeautifulSoup to extract text
|
||||||
parser = HTMLParser()
|
parser = HTMLParser()
|
||||||
text = parser.parse_file(Path(filepath))
|
text = parser.parse_file(Path(filepath))
|
||||||
|
elif extension == 'xlsx':
|
||||||
|
parser = XLSXParser()
|
||||||
|
text = parser.parse_file(filepath)
|
||||||
else:
|
else:
|
||||||
# ['txt', 'markdown', 'md']
|
# ['txt', 'markdown', 'md']
|
||||||
with open(filepath, "rb") as fp:
|
with open(filepath, "rb") as fp:
|
||||||
|
31
api/core/index/readers/xlsx_parser.py
Normal file
31
api/core/index/readers/xlsx_parser.py
Normal file
@ -0,0 +1,31 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
import json
|
||||||
|
from typing import Dict
|
||||||
|
from openpyxl import load_workbook
|
||||||
|
|
||||||
|
from llama_index.readers.file.base_parser import BaseParser
|
||||||
|
from flask import current_app
|
||||||
|
|
||||||
|
|
||||||
|
class XLSXParser(BaseParser):
    """Parser that converts an XLSX workbook into JSON-encoded row records."""

    def _init_parser(self) -> Dict:
        """Return the parser configuration; this parser needs none."""
        return {}

    def parse_file(self, file: Path, errors: str = "ignore") -> list:
        """Read every sheet of the workbook and serialise its rows as JSON.

        The first non-empty row of each sheet is treated as that sheet's
        header. Every following non-empty row is paired with the header and
        dumped as one JSON object string.

        Args:
            file: Path of the ``.xlsx`` file to read.
            errors: Unused; kept so the signature stays compatible with
                callers that pass it.

        Returns:
            A list of JSON strings, one per data row, in sheet order.
        """
        data = []
        # openpyxl opens the file itself, so no separate open() handle is
        # needed. A read-only workbook must be closed explicitly.
        wb = load_workbook(filename=file, read_only=True)
        try:
            for sheet in wb:
                # Each sheet has its own header row; reset it per sheet so
                # later sheets are not decoded with the first sheet's header.
                keys = None
                for row in sheet.iter_rows(values_only=True):
                    # Skip rows in which every cell is empty.
                    if all(v is None for v in row):
                        continue
                    if keys is None:
                        keys = row
                        continue
                    record = dict(zip(keys, row))
                    # default=str is deliberate: date/time cells come back as
                    # datetime objects, which json cannot serialise natively.
                    data.append(
                        json.dumps(record, ensure_ascii=False, default=str)
                    )
        finally:
            wb.close()
        return data
|
@ -12,6 +12,8 @@ from llama_index.data_structs import Node
|
|||||||
from llama_index.data_structs.node_v2 import DocumentRelationship
|
from llama_index.data_structs.node_v2 import DocumentRelationship
|
||||||
from llama_index.node_parser import SimpleNodeParser, NodeParser
|
from llama_index.node_parser import SimpleNodeParser, NodeParser
|
||||||
from llama_index.readers.file.base import DEFAULT_FILE_EXTRACTOR
|
from llama_index.readers.file.base import DEFAULT_FILE_EXTRACTOR
|
||||||
|
from llama_index.readers.file.markdown_parser import MarkdownParser
|
||||||
|
from core.index.readers.xlsx_parser import XLSXParser
|
||||||
from core.docstore.dataset_docstore import DatesetDocumentStore
|
from core.docstore.dataset_docstore import DatesetDocumentStore
|
||||||
from core.index.keyword_table_index import KeywordTableIndex
|
from core.index.keyword_table_index import KeywordTableIndex
|
||||||
from core.index.readers.html_parser import HTMLParser
|
from core.index.readers.html_parser import HTMLParser
|
||||||
@ -250,6 +252,7 @@ class IndexingRunner:
|
|||||||
file_extractor[".html"] = HTMLParser()
|
file_extractor[".html"] = HTMLParser()
|
||||||
file_extractor[".htm"] = HTMLParser()
|
file_extractor[".htm"] = HTMLParser()
|
||||||
file_extractor[".pdf"] = PDFParser({'upload_file': upload_file})
|
file_extractor[".pdf"] = PDFParser({'upload_file': upload_file})
|
||||||
|
file_extractor[".xlsx"] = XLSXParser()
|
||||||
|
|
||||||
loader = SimpleDirectoryReader(input_files=[filepath], file_extractor=file_extractor)
|
loader = SimpleDirectoryReader(input_files=[filepath], file_extractor=file_extractor)
|
||||||
text_docs = loader.load_data()
|
text_docs = loader.load_data()
|
||||||
|
@ -29,4 +29,5 @@ sentry-sdk[flask]~=1.21.1
|
|||||||
jieba==0.42.1
|
jieba==0.42.1
|
||||||
celery==5.2.7
|
celery==5.2.7
|
||||||
redis~=4.5.4
|
redis~=4.5.4
|
||||||
pypdf==3.8.1
|
pypdf==3.8.1
|
||||||
|
openpyxl==3.1.2
|
Loading…
Reference in New Issue
Block a user