From eeb2c28526e93a234c83e20aa82226e16a9d3d7b Mon Sep 17 00:00:00 2001 From: Columbus <36625222+AtmosphereMao@users.noreply.github.com> Date: Fri, 16 Jun 2023 14:12:07 +0800 Subject: [PATCH] =?UTF-8?q?Fix=20the=20issue=20of=20decoding=20a=20non-UTF?= =?UTF-8?q?-8=20encoded=20file=20using=20UTF-8=20encodi=E2=80=A6=20(#378)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- api/controllers/console/datasets/file.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/api/controllers/console/datasets/file.py b/api/controllers/console/datasets/file.py index f12c372721..5db0446175 100644 --- a/api/controllers/console/datasets/file.py +++ b/api/controllers/console/datasets/file.py @@ -1,6 +1,7 @@ import datetime import hashlib import tempfile +import chardet import time import uuid from pathlib import Path @@ -141,7 +142,8 @@ class FilePreviewApi(Resource): # ['txt', 'markdown', 'md'] with open(filepath, "rb") as fp: data = fp.read() - text = data.decode(encoding='utf-8').strip() if data else '' + encoding = chardet.detect(data)['encoding'] + text = data.decode(encoding=encoding).strip() if data else '' text = text[0:PREVIEW_WORDS_LIMIT] if text else '' return {'content': text}