Initial commit: import from sinmb79/Gov-chat-bot

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-26 12:49:43 +09:00
commit a16c972dbb
104 changed files with 8063 additions and 0 deletions
@@ -0,0 +1,96 @@
+"""
+문서 파서 — 1차 정식 지원 형식:
+TXT · MD · DOCX · 텍스트 PDF · HTML
+"""
+import io
+from typing import Optional
+
+
+def parse_txt(content: bytes, encoding: str = "utf-8") -> str:
+    return content.decode(encoding, errors="replace")
+
+
+def parse_md(content: bytes) -> str:
+    return content.decode("utf-8", errors="replace")
+
+
+def parse_html(content: bytes) -> str:
+    from bs4 import BeautifulSoup
+    soup = BeautifulSoup(content, "html.parser")
+    # script/style 제거
+    for tag in soup(["script", "style"]):
+        tag.decompose()
+    return soup.get_text(separator="\n", strip=True)
+
+
+def parse_docx(content: bytes) -> str:
+    from docx import Document
+    doc = Document(io.BytesIO(content))
+    return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
+
+
+def parse_pdf(content: bytes) -> str:
+    try:
+        import pdfplumber
+        with pdfplumber.open(io.BytesIO(content)) as pdf:
+            pages = []
+            for page in pdf.pages:
+                text = page.extract_text()
+                if text:
+                    pages.append(text)
+            return "\n".join(pages)
+    except Exception:
+        return ""
+
+
+PARSERS = {
+    "txt": parse_txt,
+    "md": parse_md,
+    "html": parse_html,
+    "htm": parse_html,
+    "docx": parse_docx,
+    "pdf": parse_pdf,
+}
+
+
+def extract_text(content: bytes, filename: str) -> Optional[str]:
+    """파일 확장자에 따라 적절한 파서 선택."""
+    ext = filename.rsplit(".", 1)[-1].lower() if "." in filename else ""
+    parser = PARSERS.get(ext)
+    if not parser:
+        return None
+    try:
+        return parser(content)
+    except Exception:
+        return None
+
+
+def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> list[str]:
+    """
+    문단 단위 청킹 (약 chunk_size 토큰).
+    overlap: 이전 청크 끝 글자를 다음 청크 시작에 포함.
+    """
+    paragraphs = [p.strip() for p in text.split("\n") if p.strip()]
+    chunks = []
+    current = []
+    current_len = 0
+
+    for para in paragraphs:
+        para_len = len(para)
+        if current_len + para_len > chunk_size and current:
+            chunk_text_ = "\n".join(current)
+            chunks.append(chunk_text_)
+            # overlap: 마지막 문단 유지
+            if overlap > 0 and current:
+                current = [current[-1]]
+                current_len = len(current[-1])
+            else:
+                current = []
+                current_len = 0
+        current.append(para)
+        current_len += para_len
+
+    if current:
+        chunks.append("\n".join(current))
+
+    return chunks if chunks else [text[:chunk_size]]