from pathlib import Path
import json
from typing import Dict, List
from openpyxl import load_workbook

from llama_index.readers.file.base_parser import BaseParser
from flask import current_app


class XLSXParser(BaseParser):
    """XLSX parser.

    Reads every worksheet of a workbook and emits one JSON object per data
    row, keyed by the cells of that worksheet's first non-empty row.
    """

    def _init_parser(self) -> Dict:
        """Init parser. This parser needs no configuration."""
        return {}

    @staticmethod
    def _header_key(cell):
        """Return *cell* in a form ``json.dumps`` accepts as a dict key.

        ``json.dumps`` only allows str, int, float, bool and None as keys;
        anything else (for example a date in the header row) is stringified.
        """
        if cell is None or isinstance(cell, (str, int, float, bool)):
            return cell
        return str(cell)

    def parse_file(self, file: Path, errors: str = "ignore") -> List[str]:
        """Parse an XLSX file into JSON-encoded row records.

        Args:
            file: Path of the workbook; it is handed to
                ``openpyxl.load_workbook`` unchanged.
            errors: Accepted for signature compatibility with other parsers;
                not used by this implementation.

        Returns:
            A list with one JSON string per non-empty data row, in sheet and
            row order. Each string encodes an object mapping the header cells
            of the row's own sheet to the row's cell values. Completely empty
            rows are skipped, and a sheet's header row is never emitted.
        """
        records = []
        # load_workbook opens the file itself; no separate open() is needed.
        workbook = load_workbook(filename=file, read_only=True)
        try:
            # loop over all sheets
            for sheet in workbook:
                # Each sheet carries its own header row, so start afresh.
                header = None
                for row in sheet.iter_rows(values_only=True):
                    if all(cell is None for cell in row):
                        continue
                    if header is None:
                        header = [self._header_key(cell) for cell in row]
                        continue
                    # default=str: date/time cell values are not JSON
                    # serializable on their own, so they are written as text.
                    records.append(
                        json.dumps(
                            dict(zip(header, row)),
                            ensure_ascii=False,
                            default=str,
                        )
                    )
        finally:
            # Read-only workbooks keep the file open until closed explicitly.
            workbook.close()
        return records