diff --git a/api/celerybeat-schedule.db b/api/celerybeat-schedule.db deleted file mode 100644 index 33c27aea09..0000000000 Binary files a/api/celerybeat-schedule.db and /dev/null differ diff --git a/api/core/rag/extractor/word_extractor_test.py b/api/core/rag/extractor/word_extractor_test.py deleted file mode 100644 index 99ca5de3ea..0000000000 --- a/api/core/rag/extractor/word_extractor_test.py +++ /dev/null @@ -1,118 +0,0 @@ -"""Abstract interface for document loader implementations.""" -import os -import tempfile -from urllib.parse import urlparse - -import requests -from docx import Document as DocxDocument - -from core.rag.extractor.extractor_base import BaseExtractor -from core.rag.models.document import Document - - -class WordExtractorTest(BaseExtractor): - """Load docx files. - - - Args: - file_path: Path to the file to load. - """ - - def __init__(self, file_path: str): - """Initialize with file path.""" - self.file_path = file_path - if "~" in self.file_path: - self.file_path = os.path.expanduser(self.file_path) - - # If the file is a web path, download it to a temporary file, and use that - if not os.path.isfile(self.file_path) and self._is_valid_url(self.file_path): - r = requests.get(self.file_path) - - if r.status_code != 200: - raise ValueError( - f"Check the url of your file; returned status code {r.status_code}" - ) - - self.web_path = self.file_path - self.temp_file = tempfile.NamedTemporaryFile() - self.temp_file.write(r.content) - self.file_path = self.temp_file.name - elif not os.path.isfile(self.file_path): - raise ValueError(f"File path {self.file_path} is not a valid file or url") - - def __del__(self) -> None: - if hasattr(self, "temp_file"): - self.temp_file.close() - - def extract(self) -> list[Document]: - """Load given path as single page.""" - from docx import Document as docx_Document - - document = docx_Document(self.file_path) - doc_texts = [paragraph.text for paragraph in document.paragraphs] - content = '\n'.join(doc_texts) - - return [Document( - page_content=content, - metadata={"source": self.file_path}, - )] - - @staticmethod - def _is_valid_url(url: str) -> bool: - """Check if the url is valid.""" - parsed = urlparse(url) - return bool(parsed.netloc) and bool(parsed.scheme) - - def _extract_images_from_docx(self, doc, image_folder): - image_count = 0 - image_paths = [] - - for rel in doc.part.rels.values(): - if "image" in rel.target_ref: - image_count += 1 - image_ext = rel.target_ref.split('.')[-1] - image_name = f"image{image_count}.{image_ext}" - image_path = os.path.join(image_folder, image_name) - with open(image_path, "wb") as img_file: - img_file.write(rel.target_part.blob) - image_paths.append(f"![](/api/system/img/{image_name})") - - return image_paths - - def _table_to_html(self, table): - html = "" - for row in table.rows: - html += "" - for cell in row.cells: - html += f"" - html += "" - html += "
{cell.text}
" - return html - - def parse_docx(self, docx_path, image_folder): - doc = DocxDocument(docx_path) - os.makedirs(image_folder, exist_ok=True) - - content = [] - - image_index = 0 - image_paths = self._extract_images_from_docx(doc, image_folder) - - for element in doc.element.body: - if element.tag.endswith('p'): # paragraph - paragraph = element.text.strip() - if paragraph: - content.append(paragraph) - elif element.tag.endswith('tbl'): # table - table = doc.tables[image_index] - content.append(self._table_to_html(table)) - image_index += 1 - - # 替换图片占位符 - content_with_images = [] - for item in content: - if '!' in item and '[]' in item: - item = image_paths.pop(0) - content_with_images.append(item) - - return content_with_images diff --git a/api/tests/unit_tests/oss/local/test_local.py b/api/tests/unit_tests/oss/local/test_local.py deleted file mode 100644 index 697a853669..0000000000 --- a/api/tests/unit_tests/oss/local/test_local.py +++ /dev/null @@ -1,63 +0,0 @@ -import os -import unittest -from unittest.mock import mock_open, patch - -from extensions.storage.local_storage import LocalStorage - - -class TestLocalStorage(unittest.TestCase): - def setUp(self): - # Configuration for each test - self.app_config = {'root': '/test'} - self.folder = 'test_folder/' - self.storage = LocalStorage(self.app_config, self.folder) - - @patch('os.makedirs') - def test_save(self, mock_makedirs): - # Test the save functionality - test_data = b"test data" - with patch('builtins.open', mock_open()) as mocked_file: - self.storage.save('file.txt', test_data) - mocked_file.assert_called_with(os.path.join(os.getcwd(), 'test_folder/file.txt'), "wb") - handle = mocked_file() - handle.write.assert_called_once_with(test_data) - - @patch('os.path.exists', return_value=True) - @patch('builtins.open', new_callable=mock_open, read_data=b"test data") - def test_load_once(self, mock_open, mock_exists): - # Test the load_once method - data = self.storage.load_once('file.txt') - self.assertEqual(data, b"test data") - - @patch('os.path.exists', return_value=True) - def test_load_stream(self, mock_exists): - # Test the load_stream method - with patch('builtins.open', mock_open(read_data=b"test data")) as mocked_file: - generator = self.storage.load_stream('file.txt') - output = list(generator) - self.assertEqual(output, [b'test data']) - - @patch('shutil.copyfile') - @patch('os.path.exists', return_value=True) - def test_download(self, mock_exists, mock_copyfile): - # Test the download method - self.storage.download('file.txt', 'target.txt') - mock_copyfile.assert_called_once_with('test_folder/file.txt', 'target.txt') - - @patch('os.path.exists', return_value=True) - def test_exists(self, mock_exists): - # Test the exists method - self.assertTrue(self.storage.exists('file.txt')) - - @patch('os.path.exists', return_value=True) - @patch('os.remove') - def test_delete(self, mock_remove, mock_exists): - # Test the delete method - self.storage.delete('file.txt') - mock_remove.assert_called_once_with('test_folder/file.txt') - - @patch('os.path.exists', return_value=False) - def test_delete_file_not_found(self, mock_exists): - # Test deleting a file that does not exist - with self.assertRaises(FileNotFoundError): - self.storage.delete('file.txt') diff --git a/api/tests/unit_tests/oss/test_oss.py b/api/tests/unit_tests/oss/test_oss.py deleted file mode 100644 index 32d2916c35..0000000000 --- a/api/tests/unit_tests/oss/test_oss.py +++ /dev/null @@ -1,61 +0,0 @@ -from collections.abc import Generator -from unittest.mock import MagicMock - -import pytest - -from extensions import ext_redis - - -def get_example_filename() -> str: - return 'test_text.txt' - - -def get_example_file_data() -> bytes: - return b'test_text' - - -@pytest.fixture -def setup_mock_redis() -> None: - # get - ext_redis.redis_client.get = MagicMock(return_value=None) - - # set - ext_redis.redis_client.set = MagicMock(return_value=None) - - # lock - mock_redis_lock = MagicMock() - mock_redis_lock.__enter__ = MagicMock() - mock_redis_lock.__exit__ = MagicMock() - ext_redis.redis_client.lock = mock_redis_lock - - -class AbstractOssTest: - def __init__(self): - self.client = None - self.filename = get_example_filename() - self.data = get_example_file_data() - - def save(self): - raise NotImplementedError - - def load_once(self) -> bytes: - raise NotImplementedError - - def load_stream(self) -> Generator: - raise NotImplementedError - - def download(self): - raise NotImplementedError - - def exists(self): - raise NotImplementedError - - def delete(self): - raise NotImplementedError - - def run_all_tests(self): - self.save() - self.load_once() - self.load_stream() - self.exists() - self.delete()