from pathlib import Path

from app.core.errors import AppError

# Lower-case file suffix -> MIME type for the document kinds handled below.
# NOTE(review): not referenced inside this module; presumably consumed by
# callers (e.g. upload validation) -- confirm before changing keys or values.
SUPPORTED_EXTENSIONS = {
    ".pdf": "application/pdf",
    ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    ".txt": "text/plain",
    ".md": "text/markdown",
    ".png": "image/png",
    ".jpg": "image/jpeg",
    ".jpeg": "image/jpeg",
    ".webp": "image/webp",
}


class DocumentExtractor:
    """Extract plain text from PDF, DOCX, text/markdown and image files."""

    def extract(self, file_path: Path, mime_type: str) -> str:
        """Return the text content of *file_path*.

        The handler is chosen from either the MIME type or the lower-cased
        file suffix; a match on one of them is enough. Checks run in a fixed
        order (PDF, DOCX, text, image) and the first match wins, so a file
        whose MIME type and suffix disagree is routed by whichever check
        comes first.

        Args:
            file_path: Location of the file to read.
            mime_type: MIME type reported for the file.

        Returns:
            The extracted text, stripped of leading/trailing whitespace.

        Raises:
            AppError: 415 ``unsupported_media`` when neither the MIME type
                nor the suffix is recognised; 422 ``empty_extraction`` when
                the file yields no text; 503 ``ocr_unavailable`` when an
                image is given and OCR cannot run.
        """
        suffix = file_path.suffix.lower()

        if mime_type == "application/pdf" or suffix == ".pdf":
            return self._extract_pdf(file_path)
        if (
            mime_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
            or suffix == ".docx"
        ):
            return self._extract_docx(file_path)
        if mime_type.startswith("text/") or suffix in {".txt", ".md"}:
            return self._extract_text(file_path)
        if mime_type.startswith("image/") or suffix in {".png", ".jpg", ".jpeg", ".webp"}:
            return self._extract_image(file_path)

        raise AppError(f"Unsupported file type: {mime_type}", status_code=415, code="unsupported_media")

    def _extract_pdf(self, file_path: Path) -> str:
        """Return the text of every page, joined by newlines.

        Raises:
            AppError: 422 ``empty_extraction`` when no page yields text.
        """
        # Imported lazily so the dependency is only needed when a PDF arrives.
        from pypdf import PdfReader

        reader = PdfReader(str(file_path))
        # extract_text() may return a falsy value for a page; treat it as "".
        parts = [page.extract_text() or "" for page in reader.pages]
        text = "\n".join(parts).strip()
        if not text:
            raise AppError("PDF does not contain extractable text", status_code=422, code="empty_extraction")
        return text

    def _extract_docx(self, file_path: Path) -> str:
        """Return the non-blank paragraphs of a DOCX file, joined by newlines.

        Only ``document.paragraphs`` is read.

        Raises:
            AppError: 422 ``empty_extraction`` when no paragraph has text.
        """
        # Imported lazily so the dependency is only needed when a DOCX arrives.
        from docx import Document

        document = Document(str(file_path))
        stripped = (paragraph.text.strip() for paragraph in document.paragraphs)
        parts = [line for line in stripped if line]
        text = "\n".join(parts).strip()
        if not text:
            raise AppError("DOCX does not contain extractable text", status_code=422, code="empty_extraction")
        return text

    def _extract_text(self, file_path: Path) -> str:
        """Return the file decoded as UTF-8 and stripped.

        Bytes that are not valid UTF-8 are dropped (``errors="ignore"``)
        rather than raising.

        Raises:
            AppError: 422 ``empty_extraction`` when nothing remains after
                stripping.
        """
        text = file_path.read_text(encoding="utf-8", errors="ignore").strip()
        if not text:
            raise AppError("Text file is empty", status_code=422, code="empty_extraction")
        return text

    def _extract_image(self, file_path: Path) -> str:
        """Return text recognised in an image via Tesseract OCR (``spa+eng``).

        Raises:
            AppError: 503 ``ocr_unavailable`` when the OCR Python packages
                cannot be imported or the Tesseract executable is missing;
                422 ``empty_extraction`` when OCR recognises no text.
        """
        try:
            import pytesseract
            from PIL import Image
        except ImportError as exc:
            raise AppError(
                "Image OCR is not available on this server",
                status_code=503,
                code="ocr_unavailable",
            ) from exc

        try:
            # The context manager closes the underlying file handle once OCR
            # has finished, instead of leaving it to garbage collection.
            with Image.open(file_path) as image:
                text = pytesseract.image_to_string(image, lang="spa+eng").strip()
        except pytesseract.TesseractNotFoundError as exc:
            # The Python wrapper is installed but the tesseract binary is not:
            # report it the same way as a missing package.
            raise AppError(
                "Image OCR is not available on this server",
                status_code=503,
                code="ocr_unavailable",
            ) from exc

        if not text:
            raise AppError("Image does not contain recognizable text", status_code=422, code="empty_extraction")
        return text