Files
GenExam-IA/backend/app/services/document_extractor.py
T
Mireya Cueto Garrido 7bc27da33a Add materials, exam images, storage quota, and API guide
Upload documents for AI context, exam images for Moodle questions, per-template storage limits, embedded images in XML export, and GUIA_API_Y_FLUJO.md with full endpoint documentation.
2026-06-01 10:30:40 +02:00

75 lines
2.8 KiB
Python

from pathlib import Path
from app.core.errors import AppError
# Suffix -> MIME type tables, grouped by the kind of content they describe.
# Suffixes are lowercase and include the leading dot.
_DOCUMENT_TYPES = {
    ".pdf": "application/pdf",
    ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
}
_TEXT_TYPES = {
    ".txt": "text/plain",
    ".md": "text/markdown",
}
_IMAGE_TYPES = {
    ".png": "image/png",
    ".jpg": "image/jpeg",
    ".jpeg": "image/jpeg",
    ".webp": "image/webp",
}

# Combined lookup of every file suffix this module recognises. The merge order
# (documents, text, images) keeps the original key order of the table.
SUPPORTED_EXTENSIONS = {**_DOCUMENT_TYPES, **_TEXT_TYPES, **_IMAGE_TYPES}
class DocumentExtractor:
    """Extract plain text from PDF, DOCX, plain-text, and image files on disk.

    Every private extractor raises ``AppError`` (422, ``empty_extraction``)
    when the file yields no text, so a successful call never returns an
    empty string.
    """

    _PDF_MIME = "application/pdf"
    _DOCX_MIME = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    _TEXT_SUFFIXES = frozenset({".txt", ".md"})
    _IMAGE_SUFFIXES = frozenset({".png", ".jpg", ".jpeg", ".webp"})

    def extract(self, file_path: Path, mime_type: str) -> str:
        """Return the text content of ``file_path``.

        The extractor is chosen from the MIME type or, failing that, from the
        file suffix. Checks run in a fixed order: PDF, DOCX, text, image.

        Args:
            file_path: Location of the file to read.
            mime_type: Declared content type. ``None``, an empty string,
                upper-case values, and values carrying parameters
                (``"text/plain; charset=utf-8"``) are all tolerated; the
                suffix is used when the MIME type does not identify the file.

        Returns:
            The extracted, stripped, non-empty text.

        Raises:
            AppError: 415 ``unsupported_media`` when neither the MIME type nor
                the suffix matches a known format; plus whatever the selected
                extractor raises.
        """
        suffix = file_path.suffix.lower()
        # Drop parameters such as "; charset=utf-8" and normalise case so the
        # comparisons below see only the bare media type. A missing content
        # type becomes "" and dispatch falls through to the suffix checks.
        mime = (mime_type or "").split(";", 1)[0].strip().lower()

        if mime == self._PDF_MIME or suffix == ".pdf":
            return self._extract_pdf(file_path)
        if mime == self._DOCX_MIME or suffix == ".docx":
            return self._extract_docx(file_path)
        if mime.startswith("text/") or suffix in self._TEXT_SUFFIXES:
            return self._extract_text(file_path)
        if mime.startswith("image/") or suffix in self._IMAGE_SUFFIXES:
            return self._extract_image(file_path)
        # Report the caller's original value, not the normalised one.
        raise AppError(f"Unsupported file type: {mime_type}", status_code=415, code="unsupported_media")

    def _extract_pdf(self, file_path: Path) -> str:
        """Return the text of every page of a PDF, joined with newlines.

        Raises:
            AppError: 422 ``empty_extraction`` when no page yields any text.
        """
        # Imported lazily so the module loads even when pypdf is not installed.
        from pypdf import PdfReader

        reader = PdfReader(str(file_path))
        # extract_text() may return a falsy value for a page; treat it as "".
        parts = [page.extract_text() or "" for page in reader.pages]
        text = "\n".join(parts).strip()
        if not text:
            raise AppError("PDF does not contain extractable text", status_code=422, code="empty_extraction")
        return text

    def _extract_docx(self, file_path: Path) -> str:
        """Return the non-blank paragraphs of a DOCX file, joined with newlines.

        Only ``document.paragraphs`` is read.
        NOTE(review): text inside tables is presumably not included; confirm
        against python-docx if table content matters.

        Raises:
            AppError: 422 ``empty_extraction`` when no paragraph has text.
        """
        # Imported lazily so the module loads even when python-docx is missing.
        from docx import Document

        document = Document(str(file_path))
        stripped = (paragraph.text.strip() for paragraph in document.paragraphs)
        text = "\n".join(part for part in stripped if part).strip()
        if not text:
            raise AppError("DOCX does not contain extractable text", status_code=422, code="empty_extraction")
        return text

    def _extract_text(self, file_path: Path) -> str:
        """Return the stripped content of a UTF-8 text file.

        Undecodable bytes are dropped rather than raising. A leading UTF-8
        byte-order mark is removed, so it neither leaks into the result nor
        makes a BOM-only file look non-empty.

        Raises:
            AppError: 422 ``empty_extraction`` when the file has no text.
        """
        # "utf-8-sig" decodes exactly like "utf-8" but consumes a leading BOM.
        text = file_path.read_text(encoding="utf-8-sig", errors="ignore").strip()
        if not text:
            raise AppError("Text file is empty", status_code=422, code="empty_extraction")
        return text

    def _extract_image(self, file_path: Path) -> str:
        """Return text recognised in an image via Tesseract OCR (``spa+eng``).

        Raises:
            AppError: 503 ``ocr_unavailable`` when pytesseract/Pillow cannot be
                imported or the Tesseract executable is not found;
                422 ``empty_extraction`` when no text is recognised.
        """
        try:
            import pytesseract
            from PIL import Image
        except ImportError as exc:
            raise self._ocr_unavailable() from exc

        try:
            # The context manager closes the underlying file handle once OCR
            # has finished, including when OCR raises.
            with Image.open(file_path) as image:
                text = pytesseract.image_to_string(image, lang="spa+eng").strip()
        except pytesseract.TesseractNotFoundError as exc:
            # The Python package is present but the Tesseract binary is not:
            # report it the same way as a missing package.
            raise self._ocr_unavailable() from exc

        if not text:
            raise AppError("Image does not contain recognizable text", status_code=422, code="empty_extraction")
        return text

    @staticmethod
    def _ocr_unavailable() -> "AppError":
        """Build the 503 error used whenever OCR cannot run on this server."""
        return AppError(
            "Image OCR is not available on this server",
            status_code=503,
            code="ocr_unavailable",
        )