Gov-chat-bot/backend/app/routers/admin_crawler.py

"""
크롤러 관리 API.
POST /api/admin/crawler/urls — URL 등록
GET /api/admin/crawler/urls — URL 목록
POST /api/admin/crawler/run/{url_id} — 수동 크롤링 실행
DELETE /api/admin/crawler/urls/{id} — URL 삭제
"""
from typing import Optional

from fastapi import APIRouter, Depends, HTTPException, Request, status
from pydantic import BaseModel
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession

from app.core.database import get_db
from app.core.deps import require_editor
from app.models.admin import AdminUser
from app.models.knowledge import CrawlerURL, Document
from app.services.crawler import CrawlerService
from app.services.document_processor import DocumentProcessor
from app.services.audit import log_action

router = APIRouter(prefix="/api/admin/crawler", tags=["admin-crawler"])

class CrawlerURLCreate(BaseModel):
    url: str
    url_type: str = "page"
    interval_hours: int = 24


class CrawlerURLOut(BaseModel):
    id: str
    url: str
    url_type: str
    interval_hours: int
    is_active: bool
    last_crawled: Optional[str] = None

    class Config:
        from_attributes = True


@router.post("/urls", status_code=status.HTTP_201_CREATED)
async def register_url(
    body: CrawlerURLCreate,
    db: AsyncSession = Depends(get_db),
    current_user: AdminUser = Depends(require_editor),
):
    """Register a crawler URL."""
    crawler_url = CrawlerURL(
        tenant_id=current_user.tenant_id,
        url=body.url,
        url_type=body.url_type,
        interval_hours=body.interval_hours,
        is_active=True,
    )
    db.add(crawler_url)
    await db.commit()
    await db.refresh(crawler_url)
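    # Record the registration in the audit log.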
    await log_action(
        db=db,
        tenant_id=current_user.tenant_id,
        actor_id=current_user.id,
        actor_type="admin_user",
        action="crawler.approve",
        target_type="crawler_url",
        target_id=crawler_url.id,
        diff={"url": body.url},
    )
    return {"id": crawler_url.id, "url": crawler_url.url}


@router.get("/urls", response_model=list[CrawlerURLOut])
async def list_urls(
    db: AsyncSession = Depends(get_db),
    current_user: AdminUser = Depends(require_editor),
):
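    """List the crawler URLs registered for the current tenant."""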
    result = await db.execute(
        select(CrawlerURL).where(CrawlerURL.tenant_id == current_user.tenant_id)
    )
    return result.scalars().all()


@router.post("/run/{url_id}")
async def run_crawl(
    url_id: str,
    request: Request,
    db: AsyncSession = Depends(get_db),
    current_user: AdminUser = Depends(require_editor),
):
    """Run a manual crawl → extract text → store as a document."""
    tenant_id = current_user.tenant_id
    result = await db.execute(
        select(CrawlerURL).where(CrawlerURL.id == url_id, CrawlerURL.tenant_id == tenant_id)
    )
    crawler_url = result.scalar_one_or_none()
    if not crawler_url:
        raise HTTPException(status_code=404, detail="Crawler URL not found")
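    # Fetch the page and extract its text; an empty result means the crawl
    # failed or was disallowed by robots.txt.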
    service = CrawlerService(db)
    text = await service.run(crawler_url, tenant_id)
    if not text:
        raise HTTPException(status_code=422, detail="Failed to crawl or robots.txt disallowed")
    # Store the crawl result as a document.
    from urllib.parse import urlparse

    parsed = urlparse(crawler_url.url)
    filename = parsed.netloc + parsed.path.replace("/", "_") + ".txt"
    doc = Document(
        tenant_id=tenant_id,
        filename=filename,
        source_type="crawler",
        source_url=crawler_url.url,
        is_active=False,  # approved after editor review
        status="pending",
    )
    db.add(doc)
    await db.flush()
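    # Chunk, embed, and index the extracted text using the app-wide providers.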
    providers = getattr(request.app.state, "providers", {})
    processor = DocumentProcessor(
        embedding_provider=providers.get("embedding"),
        vectordb_provider=providers.get("vectordb"),
        db=db,
    )
    chunk_count = await processor.process(tenant_id, doc, text.encode("utf-8"))
    return {
        "doc_id": doc.id,
        "url": crawler_url.url,
        "chunk_count": chunk_count,
        "status": doc.status,
    }


@router.delete("/urls/{url_id}", status_code=status.HTTP_204_NO_CONTENT)
async def delete_url(
    url_id: str,
    db: AsyncSession = Depends(get_db),
    current_user: AdminUser = Depends(require_editor),
):
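    """Delete a crawler URL belonging to the current tenant."""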
    result = await db.execute(
        select(CrawlerURL).where(CrawlerURL.id == url_id, CrawlerURL.tenant_id == current_user.tenant_id)
    )
    crawler_url = result.scalar_one_or_none()
    if not crawler_url:
        raise HTTPException(status_code=404, detail="Crawler URL not found")
    await db.delete(crawler_url)
    await db.commit()
    await log_action(
        db=db,
        tenant_id=current_user.tenant_id,
        actor_id=current_user.id,
        actor_type="admin_user",
        action="crawler.reject",
        target_type="crawler_url",
        target_id=url_id,
    )