Files
Gov-chat-bot/backend/tests/test_crawler.py
2026-03-26 12:49:43 +09:00

93 lines
3.1 KiB
Python

"""
크롤러 서비스 단위 테스트.
"""
import pytest
from unittest.mock import AsyncMock, MagicMock, patch
from app.services.crawler import crawl_url, check_robots_txt, CrawlerService
from app.services.parsers.text_parser import extract_text
@pytest.mark.asyncio
async def test_crawl_url_extracts_text():
    """A successful HTML response yields extracted text without script content."""
    page = "<html><body><p>동두천시 여권 안내</p><script>bad()</script></body></html>"

    # Fake HTTP response exposing both the raw bytes and the decoded text.
    response = MagicMock(
        content=page.encode("utf-8"),
        text=page,
        headers={"content-type": "text/html"},
        raise_for_status=MagicMock(),
    )

    # Stand-in for httpx.AsyncClient: entering the async context returns the
    # same mock, and its awaited get() returns the fake response.
    client = AsyncMock()
    client.__aenter__.return_value = client
    client.__aexit__.return_value = None
    client.get.return_value = response

    with patch("app.services.crawler.check_robots_txt", return_value=True):
        with patch("httpx.AsyncClient", return_value=client):
            extracted = await crawl_url("https://www.example.com/guide")

    assert extracted is not None
    assert "동두천시" in extracted
    assert "bad()" not in extracted
@pytest.mark.asyncio
async def test_crawl_url_returns_none_on_robots_disallow():
    """crawl_url returns None when the robots.txt check reports disallowed."""
    robots_denied = patch(
        "app.services.crawler.check_robots_txt", return_value=False
    )
    with robots_denied:
        outcome = await crawl_url("https://www.example.com/private")

    assert outcome is None
@pytest.mark.asyncio
async def test_crawl_url_returns_none_on_network_error():
    """A network error yields None; the exception is not propagated."""
    import httpx

    # Stand-in for httpx.AsyncClient whose get() raises a connection error.
    client = AsyncMock()
    client.__aenter__.return_value = client
    client.__aexit__.return_value = None
    client.get.side_effect = httpx.ConnectError("connection refused")

    with patch("app.services.crawler.check_robots_txt", return_value=True):
        with patch("httpx.AsyncClient", return_value=client):
            outcome = await crawl_url("https://unreachable.example.com")

    assert outcome is None
@pytest.mark.asyncio
async def test_crawler_service_updates_last_crawled():
    """run() returns the crawled text, sets last_crawled, and commits once.

    crawl_url is patched to return fixed text, so this test only checks what
    CrawlerService.run() does with a successful crawl result.
    """
    db = AsyncMock()
    db.commit = AsyncMock()

    crawler_url = MagicMock()
    crawler_url.url = "https://www.example.com"
    crawler_url.last_crawled = None

    with patch("app.services.crawler.crawl_url", return_value="크롤링된 내용"):
        service = CrawlerService(db)
        result = await service.run(crawler_url, "tenant-1")

    assert result == "크롤링된 내용"
    assert crawler_url.last_crawled is not None
    db.commit.assert_called_once()
    # assert_called_once() passes even if the coroutine was created but never
    # awaited; also require the await so a missing `await` is caught.
    db.commit.assert_awaited_once()
@pytest.mark.asyncio
async def test_crawler_service_returns_none_on_fail():
    """run() returns None when the underlying crawl yields None."""
    session = AsyncMock()
    target = MagicMock(url="https://fail.example.com")

    with patch("app.services.crawler.crawl_url", return_value=None):
        outcome = await CrawlerService(session).run(target, "tenant-1")

    assert outcome is None