7bc27da33a
Upload documents for AI context, exam images for Moodle questions, per-template storage limits, embedded images in XML export, and GUIA_API_Y_FLUJO.md with full endpoint documentation.
75 lines
2.8 KiB
Python
75 lines
2.8 KiB
Python
from pathlib import Path
|
|
|
|
from app.core.errors import AppError
|
|
|
|
# File extensions (lowercase, with the leading dot) mapped to their MIME types.
SUPPORTED_EXTENSIONS = {
    # Documents.
    ".pdf": "application/pdf",
    ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    ".txt": "text/plain",
    ".md": "text/markdown",
    # Images (".jpg" and ".jpeg" share one MIME type).
    ".png": "image/png",
    ".jpg": "image/jpeg",
    ".jpeg": "image/jpeg",
    ".webp": "image/webp",
}
|
|
|
|
|
|
class DocumentExtractor:
    """Extract plain text from PDF, DOCX, text and image files.

    Each file is routed to a format-specific helper based on its declared
    MIME type or, failing that, its file suffix. Every helper returns the
    stripped text and raises ``AppError`` (422, ``empty_extraction``) when
    nothing could be extracted.
    """

    def extract(self, file_path: Path, mime_type: str) -> str:
        """Return the text content of *file_path*.

        Args:
            file_path: Location of the file on disk.
            mime_type: Declared content type. It is compared
                case-insensitively and without parameters such as
                ``; charset=utf-8``. An empty or ``None`` value makes the
                dispatch rely on the file suffix alone.

        Returns:
            The stripped, non-empty extracted text.

        Raises:
            AppError: 415 ``unsupported_media`` when neither the MIME type
                nor the suffix matches a supported format; 422
                ``empty_extraction`` when the file yields no text; 503
                ``ocr_unavailable`` when OCR cannot run.
        """
        suffix = file_path.suffix.lower()
        # MIME types are case-insensitive and may carry parameters; keep only
        # the lowercase "type/subtype" part for matching.
        media_type = (mime_type or "").split(";", 1)[0].strip().lower()

        if media_type == "application/pdf" or suffix == ".pdf":
            return self._extract_pdf(file_path)
        if (
            media_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
            or suffix == ".docx"
        ):
            return self._extract_docx(file_path)
        if media_type.startswith("text/") or suffix in {".txt", ".md"}:
            return self._extract_text(file_path)
        if media_type.startswith("image/") or suffix in {".png", ".jpg", ".jpeg", ".webp"}:
            return self._extract_image(file_path)
        # Report the value exactly as the caller supplied it.
        raise AppError(f"Unsupported file type: {mime_type}", status_code=415, code="unsupported_media")

    def _extract_pdf(self, file_path: Path) -> str:
        """Join the extracted text of every PDF page with newlines.

        Raises:
            AppError: 422 ``empty_extraction`` when no page yields text.
        """
        # Imported lazily so the dependency is only loaded when a PDF is processed.
        from pypdf import PdfReader

        reader = PdfReader(str(file_path))
        # extract_text() may return a falsy value for a page; treat it as empty.
        parts = [page.extract_text() or "" for page in reader.pages]
        text = "\n".join(parts).strip()
        if not text:
            raise AppError("PDF does not contain extractable text", status_code=422, code="empty_extraction")
        return text

    def _extract_docx(self, file_path: Path) -> str:
        """Join the non-blank paragraphs of a DOCX file with newlines.

        Only ``document.paragraphs`` is read.

        Raises:
            AppError: 422 ``empty_extraction`` when every paragraph is blank.
        """
        # Imported lazily so the dependency is only loaded when a DOCX is processed.
        from docx import Document

        document = Document(str(file_path))
        stripped = (paragraph.text.strip() for paragraph in document.paragraphs)
        text = "\n".join(part for part in stripped if part).strip()
        if not text:
            raise AppError("DOCX does not contain extractable text", status_code=422, code="empty_extraction")
        return text

    def _extract_text(self, file_path: Path) -> str:
        """Read a text file as UTF-8, dropping undecodable bytes.

        Raises:
            AppError: 422 ``empty_extraction`` when the file holds no text.
        """
        # "utf-8-sig" removes a leading byte-order mark, so it neither leaks
        # into the result nor makes a BOM-only file look non-empty.
        text = file_path.read_text(encoding="utf-8-sig", errors="ignore").strip()
        if not text:
            raise AppError("Text file is empty", status_code=422, code="empty_extraction")
        return text

    def _extract_image(self, file_path: Path) -> str:
        """Run OCR on an image using the ``spa+eng`` Tesseract languages.

        Raises:
            AppError: 503 ``ocr_unavailable`` when the OCR packages or the
                Tesseract executable are missing; 422 ``empty_extraction``
                when no text is recognised.
        """
        try:
            import pytesseract
            from PIL import Image
        except ImportError as exc:
            raise self._ocr_unavailable() from exc

        try:
            # The context manager closes the underlying file handle once OCR is done.
            with Image.open(file_path) as image:
                text = pytesseract.image_to_string(image, lang="spa+eng").strip()
        except pytesseract.TesseractNotFoundError as exc:
            # Packages are installed but the tesseract binary is not: same outcome for the client.
            raise self._ocr_unavailable() from exc

        if not text:
            raise AppError("Image does not contain recognizable text", status_code=422, code="empty_extraction")
        return text

    @staticmethod
    def _ocr_unavailable() -> "AppError":
        """Build the 503 error reported whenever OCR cannot be performed."""
        return AppError(
            "Image OCR is not available on this server",
            status_code=503,
            code="ocr_unavailable",
        )
|