Initial commit: import from sinmb79/Gov-chat-bot
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,88 @@
|
||||
"""
|
||||
문서 처리 파이프라인:
|
||||
파싱 → 청킹 → 임베딩 → VectorDB 저장 → Document 레코드 업데이트
|
||||
"""
|
||||
from typing import Optional
|
||||
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.models.knowledge import Document
|
||||
from app.providers.embedding import EmbeddingProvider
|
||||
from app.providers.vectordb import VectorDBProvider
|
||||
from app.services.parsers.text_parser import extract_text, chunk_text
|
||||
|
||||
|
||||
class DocumentProcessor:
|
||||
def __init__(
|
||||
self,
|
||||
embedding_provider: EmbeddingProvider,
|
||||
vectordb_provider: VectorDBProvider,
|
||||
db: AsyncSession,
|
||||
):
|
||||
self.embedding = embedding_provider
|
||||
self.vectordb = vectordb_provider
|
||||
self.db = db
|
||||
|
||||
async def process(self, tenant_id: str, doc: Document, content: bytes) -> int:
|
||||
"""
|
||||
문서를 파싱·청킹·임베딩하여 VectorDB에 저장.
|
||||
chunk_count 반환. 실패 시 0.
|
||||
"""
|
||||
# 1. 텍스트 추출
|
||||
text = extract_text(content, doc.filename)
|
||||
if not text or not text.strip():
|
||||
doc.status = "parse_failed"
|
||||
await self.db.commit()
|
||||
return 0
|
||||
|
||||
# 2. 청킹
|
||||
chunks = chunk_text(text)
|
||||
if not chunks:
|
||||
doc.status = "parse_failed"
|
||||
await self.db.commit()
|
||||
return 0
|
||||
|
||||
# 3. 임베딩
|
||||
try:
|
||||
embeddings = await self.embedding.embed(chunks)
|
||||
except NotImplementedError:
|
||||
doc.status = "embedding_unavailable"
|
||||
await self.db.commit()
|
||||
return 0
|
||||
except Exception:
|
||||
doc.status = "embedding_failed"
|
||||
await self.db.commit()
|
||||
return 0
|
||||
|
||||
# 4. 메타데이터 구성
|
||||
published = doc.published_at.strftime("%Y.%m") if doc.published_at else ""
|
||||
metadatas = [
|
||||
{
|
||||
"doc_id": doc.id,
|
||||
"filename": doc.filename,
|
||||
"chunk_idx": i,
|
||||
"published_at": published,
|
||||
"tenant_id": tenant_id,
|
||||
}
|
||||
for i in range(len(chunks))
|
||||
]
|
||||
|
||||
# 5. VectorDB 저장
|
||||
await self.vectordb.upsert(
|
||||
tenant_id=tenant_id,
|
||||
doc_id=doc.id,
|
||||
chunks=chunks,
|
||||
embeddings=embeddings,
|
||||
metadatas=metadatas,
|
||||
)
|
||||
|
||||
# 6. Document 레코드 업데이트
|
||||
doc.chunk_count = len(chunks)
|
||||
doc.status = "processed"
|
||||
await self.db.commit()
|
||||
|
||||
return len(chunks)
|
||||
|
||||
async def delete(self, tenant_id: str, doc_id: str) -> None:
|
||||
"""VectorDB에서 문서 청크 삭제."""
|
||||
await self.vectordb.delete(tenant_id=tenant_id, doc_id=doc_id)
|
||||
Reference in New Issue
Block a user