Fix character decoding issues with text-like files

2024-12-15 10:37:15 +05:30
parent 81e3f24acd
commit 52b723724c
4 changed files with 20 additions and 3 deletions
@@ -0,0 +1,4 @@
+–¼‘O,”N—î,�Z�Š
+�²“¡‘¾˜Y,30,“Œ‹ž
+ŽO–Ø‰pŽq,25,‘å�ã
+îà‹´�~,35,–¼ŒÃ‰®
@@ -87,6 +87,13 @@ SERP_TEST_EXCLUDES = [
    "data:image/svg+xml,%3Csvg%20width%3D",
 ]

+CSV_CP932_TEST_STRINGS = [  # rows asserted to appear in the converted text of test_mskanji.csv
+    "名前,年齢,住所",
+    "佐藤太郎,30,東京",
+    "三木英子,25,大阪",
+    "髙橋淳,35,名古屋",  # NOTE(review): 髙 (U+9AD9) looks like a CP932-only extension char, so this row presumably fails under plain Shift_JIS decoding — confirm
+]
+

@pytest.mark.skipif(
    skip_remote,
@@ -164,6 +171,12 @@ def test_markitdown_local() -> None:
    for test_string in SERP_TEST_STRINGS:
        assert test_string in text_content

+    ## Test non-UTF-8 encoding
+    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_mskanji.csv"))
+    text_content = result.text_content.replace("\\", "")
+    for test_string in CSV_CP932_TEST_STRINGS:
+        assert test_string in text_content
+

@pytest.mark.skipif(
    skip_exiftool,