Files
Gov-chat-bot/backend/app/services/document_processor.py
2026-03-26 12:49:43 +09:00

89 lines
2.6 KiB
Python

"""
문서 처리 파이프라인:
파싱 → 청킹 → 임베딩 → VectorDB 저장 → Document 레코드 업데이트
"""
import logging
from typing import Optional

from sqlalchemy.ext.asyncio import AsyncSession

from app.models.knowledge import Document
from app.providers.embedding import EmbeddingProvider
from app.providers.vectordb import VectorDBProvider
from app.services.parsers.text_parser import extract_text, chunk_text


class DocumentProcessor:
    """Run the document ingestion pipeline for a tenant.

    Pipeline: parse -> chunk -> embed -> store in the vector DB -> update the
    ``Document`` record.
    """

    # Shared logger, named after the defining module.
    _logger = logging.getLogger(__name__)

    def __init__(
        self,
        embedding_provider: EmbeddingProvider,
        vectordb_provider: VectorDBProvider,
        db: AsyncSession,
    ):
        """Store the collaborators used by the pipeline.

        Args:
            embedding_provider: Object whose awaitable ``embed(chunks)`` is
                called to embed the chunk list.
            vectordb_provider: Object whose awaitable ``upsert`` and
                ``delete`` are called to store and remove chunks.
            db: Session that is committed after every status change.
        """
        self.embedding = embedding_provider
        self.vectordb = vectordb_provider
        self.db = db

    async def _mark_failed(self, doc: Document, status: str) -> int:
        """Set ``doc.status`` to *status*, commit, and return ``0``."""
        doc.status = status
        await self.db.commit()
        return 0

    async def process(self, tenant_id: str, doc: Document, content: bytes) -> int:
        """Parse, chunk and embed a document, then store it in the vector DB.

        On success ``doc.chunk_count`` is set, ``doc.status`` becomes
        ``"processed"`` and the session is committed.

        Args:
            tenant_id: Tenant identifier passed to the vector DB and written
                into every chunk's metadata.
            doc: Document record; ``filename``, ``id`` and ``published_at``
                are read, ``status`` and ``chunk_count`` are written.
            content: Raw file content handed to ``extract_text``.

        Returns:
            The number of stored chunks, or ``0`` when parsing or embedding
            fails. In that case ``doc.status`` is set to ``"parse_failed"``,
            ``"embedding_unavailable"`` or ``"embedding_failed"`` and
            committed.

        Raises:
            Exception: Errors raised by ``vectordb.upsert`` or ``db.commit``
                are not caught here and propagate to the caller; when
                ``upsert`` raises, ``doc.status`` is left unchanged.
        """
        # 1. Text extraction: empty or whitespace-only text is a parse failure.
        text = extract_text(content, doc.filename)
        if not text or not text.strip():
            return await self._mark_failed(doc, "parse_failed")

        # 2. Chunking: no chunks is reported with the same status as step 1.
        chunks = chunk_text(text)
        if not chunks:
            return await self._mark_failed(doc, "parse_failed")

        # 3. Embedding: failures are recorded on the document instead of being
        #    raised, but they are logged so the cause is not lost.
        try:
            embeddings = await self.embedding.embed(chunks)
        except NotImplementedError:
            self._logger.warning(
                "Embedding provider unavailable for document %s (%s)",
                doc.id,
                doc.filename,
            )
            return await self._mark_failed(doc, "embedding_unavailable")
        except Exception:
            # Deliberately broad: any provider error becomes a status value.
            # logger.exception keeps the traceback that was previously dropped.
            self._logger.exception(
                "Embedding failed for document %s (%s)",
                doc.id,
                doc.filename,
            )
            return await self._mark_failed(doc, "embedding_failed")

        # 4. Metadata: one dict per chunk, in chunk order.
        published = doc.published_at.strftime("%Y.%m") if doc.published_at else ""
        metadatas = [
            {
                "doc_id": doc.id,
                "filename": doc.filename,
                "chunk_idx": i,
                "published_at": published,
                "tenant_id": tenant_id,
            }
            for i in range(len(chunks))
        ]

        # 5. Vector DB storage (exceptions propagate, see Raises).
        await self.vectordb.upsert(
            tenant_id=tenant_id,
            doc_id=doc.id,
            chunks=chunks,
            embeddings=embeddings,
            metadatas=metadatas,
        )

        # 6. Document record update.
        chunk_count = len(chunks)
        doc.chunk_count = chunk_count
        doc.status = "processed"
        await self.db.commit()
        return chunk_count

    async def delete(self, tenant_id: str, doc_id: str) -> None:
        """Delete the document's chunks from the vector DB.

        Only ``vectordb.delete`` is called; no ``Document`` record is touched
        here.
        """
        await self.vectordb.delete(tenant_id=tenant_id, doc_id=doc_id)