From 01581dd35fbe936874fb1d43efa225d345e80702 Mon Sep 17 00:00:00 2001 From: Jyong <76649700+JohnJyong@users.noreply.github.com> Date: Tue, 3 Sep 2024 17:52:07 +0800 Subject: [PATCH] improve the notion table extract (#7925) --- api/core/rag/extractor/notion_extractor.py | 25 +++++++++++++--------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/api/core/rag/extractor/notion_extractor.py b/api/core/rag/extractor/notion_extractor.py index 9535455909..2b12b8a4b2 100644 --- a/api/core/rag/extractor/notion_extractor.py +++ b/api/core/rag/extractor/notion_extractor.py @@ -281,20 +281,25 @@ class NotionExtractor(BaseExtractor): for table_header_cell_text in tabel_header_cell: text = table_header_cell_text["text"]["content"] table_header_cell_texts.append(text) - # get table columns text and format + else: + table_header_cell_texts.append('') + # Initialize Markdown table with headers + markdown_table = "| " + " | ".join(table_header_cell_texts) + " |\n" + markdown_table += "| " + " | ".join(['---'] * len(table_header_cell_texts)) + " |\n" + + # Process data to format each row in Markdown table format results = data["results"] for i in range(len(results) - 1): column_texts = [] - tabel_column_cells = data["results"][i + 1]['table_row']['cells'] - for j in range(len(tabel_column_cells)): - if tabel_column_cells[j]: - for table_column_cell_text in tabel_column_cells[j]: + table_column_cells = data["results"][i + 1]['table_row']['cells'] + for j in range(len(table_column_cells)): + if table_column_cells[j]: + for table_column_cell_text in table_column_cells[j]: column_text = table_column_cell_text["text"]["content"] - column_texts.append(f'{table_header_cell_texts[j]}:{column_text}') - - cur_result_text = "\n".join(column_texts) - result_lines_arr.append(cur_result_text) - + column_texts.append(column_text) + # Add row to Markdown table + markdown_table += "| " + " | ".join(column_texts) + " |\n" + result_lines_arr.append(markdown_table) if data["next_cursor"] is None: done = True break