diff --git a/api/core/rag/extractor/excel_extractor.py b/api/core/rag/extractor/excel_extractor.py index 2b0066448e..4d2f61139a 100644 --- a/api/core/rag/extractor/excel_extractor.py +++ b/api/core/rag/extractor/excel_extractor.py @@ -39,8 +39,8 @@ class ExcelExtractor(BaseExtractor): documents = [] # loop over all sheets for sheet in wb.sheets(): - for row_index, row in enumerate(sheet.get_rows(), start=1): - row_header = None + row_header = None + for row_index, row in enumerate(sheet.get_rows(), start=1): if self.is_blank_row(row): continue if row_header is None: @@ -49,8 +49,8 @@ class ExcelExtractor(BaseExtractor): item_arr = [] for index, cell in enumerate(row): txt_value = str(cell.value) - item_arr.append(f'{row_header[index].value}:{txt_value}') - item_str = "\n".join(item_arr) + item_arr.append(f'"{row_header[index].value}":"{txt_value}"') + item_str = ",".join(item_arr) document = Document(page_content=item_str, metadata={'source': self._file_path}) documents.append(document) return documents @@ -68,7 +68,7 @@ class ExcelExtractor(BaseExtractor): # transform each row into a Document for _, row in df.iterrows(): - item = ';'.join(f'{k}:{v}' for k, v in row.items() if pd.notna(v)) + item = ';'.join(f'"{k}":"{v}"' for k, v in row.items() if pd.notna(v)) document = Document(page_content=item, metadata={'source': self._file_path}) data.append(document) return data