Files
Gov-chat-bot/backend/tests/test_document_processor.py
2026-03-26 12:49:43 +09:00

124 lines
3.7 KiB
Python

"""
DocumentProcessor + 텍스트 파서 단위 테스트.
"""
import pytest
from unittest.mock import AsyncMock, MagicMock
from app.services.parsers.text_parser import extract_text, chunk_text
from app.services.document_processor import DocumentProcessor
# ── 파서 테스트 ─────────────────────────────────────────────────────
def test_extract_text_txt():
    """A UTF-8 ``.txt`` payload yields text that contains the source wording."""
    payload = "안녕하세요\n여권 발급 안내입니다".encode("utf-8")
    assert "여권" in extract_text(payload, "guide.txt")
def test_extract_text_md():
    """A UTF-8 ``.md`` payload yields text that contains the body wording."""
    payload = "# 제목\n본문 내용".encode("utf-8")
    assert "본문" in extract_text(payload, "readme.md")
def test_extract_text_html():
    """HTML input keeps the paragraph text; the script body must not appear."""
    markup = "<html><body><p>여권 안내</p><script>alert(1)</script></body></html>"
    extracted = extract_text(markup.encode("utf-8"), "page.html")
    # Visible paragraph text survives extraction ...
    assert "여권" in extracted
    # ... while the content of the <script> element does not.
    assert "alert" not in extracted
def test_extract_text_unsupported_returns_none():
    """A ``.exe`` filename makes ``extract_text`` return ``None``."""
    assert extract_text(b"binary", "file.exe") is None
def test_chunk_text_splits_correctly():
    """Fifty short lines with ``chunk_size=100`` give several non-empty chunks."""
    sentences = [f"문장 {idx}" for idx in range(50)]
    pieces = chunk_text("\n".join(sentences), chunk_size=100)
    assert len(pieces) > 1
    assert all(len(piece) > 0 for piece in pieces)
def test_chunk_text_small_text_single_chunk():
    """Text shorter than ``chunk_size`` comes back as exactly one chunk."""
    short_text = "짧은 텍스트"
    pieces = chunk_text(short_text, chunk_size=500)
    assert len(pieces) == 1
    assert short_text in pieces[0]
# ── DocumentProcessor 테스트 ────────────────────────────────────────
@pytest.mark.asyncio
async def test_processor_stores_chunks_in_vectordb():
    """Successful parse -> upsert is called on the vector DB."""
    # Arrange: two fake 768-element vectors stand in for the embedding output.
    fake_vectors = [[0.1] * 768, [0.2] * 768]
    embedder = AsyncMock(embed=AsyncMock(return_value=fake_vectors))
    vector_store = AsyncMock(upsert=AsyncMock(return_value=2))
    session = AsyncMock(commit=AsyncMock())
    document = MagicMock(
        id="doc-1",
        filename="test.txt",
        published_at=None,
        status="pending",
        chunk_count=0,
    )
    payload = "첫 번째 문단입니다.\n두 번째 문단입니다.".encode("utf-8")

    # Act
    processor = DocumentProcessor(embedder, vector_store, session)
    stored = await processor.process("tenant-1", document, payload)

    # Assert: a positive count is returned, upsert was invoked, status updated.
    assert stored > 0
    assert vector_store.upsert.called
    assert document.status == "processed"
@pytest.mark.asyncio
async def test_processor_fails_on_unsupported_format():
    """Unsupported file format -> process returns 0 and status=parse_failed."""
    # Arrange: collaborators are plain async mocks; the document has a .exe name.
    embedder = AsyncMock()
    vector_store = AsyncMock()
    session = AsyncMock(commit=AsyncMock())
    document = MagicMock(
        id="doc-2",
        filename="file.exe",
        published_at=None,
        status="pending",
    )

    # Act
    processor = DocumentProcessor(embedder, vector_store, session)
    stored = await processor.process("tenant-1", document, b"binary data")

    # Assert
    assert stored == 0
    assert document.status == "parse_failed"
@pytest.mark.asyncio
async def test_processor_handles_embedding_not_implemented():
    """Embedding not configured -> process returns 0 and status=embedding_unavailable."""
    # Arrange: the embedding provider raises NotImplementedError when called.
    embedder = AsyncMock(embed=AsyncMock(side_effect=NotImplementedError))
    vector_store = AsyncMock()
    session = AsyncMock(commit=AsyncMock())
    document = MagicMock(
        id="doc-3",
        filename="guide.txt",
        published_at=None,
        status="pending",
    )
    payload = "본문 내용".encode("utf-8")

    # Act
    processor = DocumentProcessor(embedder, vector_store, session)
    stored = await processor.process("tenant-1", document, payload)

    # Assert
    assert stored == 0
    assert document.status == "embedding_unavailable"