Add materials, exam images, storage quota, and API guide
Upload documents for AI context, exam images for Moodle questions, per-template storage limits, embedded images in XML export, and GUIA_API_Y_FLUJO.md with full endpoint documentation.
This commit is contained in:
@@ -0,0 +1,74 @@
|
||||
from pathlib import Path
|
||||
|
||||
from app.core.errors import AppError
|
||||
|
||||
# Upload extensions accepted by the extractor, mapped to their canonical
# MIME type. Keys are lower-case and include the leading dot.
SUPPORTED_EXTENSIONS = {
    # Documents
    ".pdf": "application/pdf",
    ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    # Plain-text formats
    ".txt": "text/plain",
    ".md": "text/markdown",
    # Raster images (text is recovered through OCR)
    ".png": "image/png",
    ".jpg": "image/jpeg",
    ".jpeg": "image/jpeg",
    ".webp": "image/webp",
}
|
||||
|
||||
|
||||
class DocumentExtractor:
    """Extract plain text from uploaded PDF, DOCX, text and image files.

    The handler is chosen from the declared MIME type or, failing that, the
    file extension. Every handler returns stripped text and raises
    ``AppError`` (422, ``empty_extraction``) when nothing usable is found.
    """

    def extract(self, file_path: Path, mime_type: str) -> str:
        """Return the text content of ``file_path``.

        Args:
            file_path: Location of the file on disk.
            mime_type: Declared media type of the file. Matching is
                case-insensitive and ignores parameters such as
                ``; charset=utf-8``. An empty or ``None`` value falls back
                to extension-based detection.

        Returns:
            The extracted text with surrounding whitespace removed.

        Raises:
            AppError: 415 ``unsupported_media`` when neither the MIME type
                nor the extension is recognised; 422 ``empty_extraction``
                when no text could be extracted; 503 ``ocr_unavailable``
                when an image is given but OCR cannot run.
        """
        suffix = file_path.suffix.lower()
        # Media types are case-insensitive and may carry parameters; compare
        # only the bare, lower-cased "type/subtype" part.
        media_type = (mime_type or "").split(";", 1)[0].strip().lower()

        if media_type == "application/pdf" or suffix == ".pdf":
            return self._extract_pdf(file_path)
        if (
            media_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
            or suffix == ".docx"
        ):
            return self._extract_docx(file_path)
        if media_type.startswith("text/") or suffix in {".txt", ".md"}:
            return self._extract_text(file_path)
        if media_type.startswith("image/") or suffix in {".png", ".jpg", ".jpeg", ".webp"}:
            return self._extract_image(file_path)
        raise AppError(f"Unsupported file type: {mime_type}", status_code=415, code="unsupported_media")

    def _extract_pdf(self, file_path: Path) -> str:
        """Concatenate the text layer of every page, one page per line group."""
        # Imported lazily so the module loads even when pypdf is unused.
        from pypdf import PdfReader

        reader = PdfReader(str(file_path))
        # extract_text() may yield nothing for image-only pages.
        parts = [page.extract_text() or "" for page in reader.pages]
        text = "\n".join(parts).strip()
        if not text:
            raise AppError("PDF does not contain extractable text", status_code=422, code="empty_extraction")
        return text

    def _extract_docx(self, file_path: Path) -> str:
        """Join the non-empty paragraphs of a DOCX document with newlines."""
        from docx import Document

        document = Document(str(file_path))
        stripped = (paragraph.text.strip() for paragraph in document.paragraphs)
        text = "\n".join(part for part in stripped if part).strip()
        if not text:
            raise AppError("DOCX does not contain extractable text", status_code=422, code="empty_extraction")
        return text

    def _extract_text(self, file_path: Path) -> str:
        """Read a text file as UTF-8, dropping undecodable bytes."""
        # errors="ignore" is a deliberate best effort for files in other encodings.
        text = file_path.read_text(encoding="utf-8", errors="ignore").strip()
        if not text:
            raise AppError("Text file is empty", status_code=422, code="empty_extraction")
        return text

    def _extract_image(self, file_path: Path) -> str:
        """Run OCR (Spanish + English) on an image and return the recognised text.

        Raises:
            AppError: 503 ``ocr_unavailable`` when the OCR Python packages or
                the Tesseract binary are missing; 422 ``empty_extraction``
                when no text is recognised.
        """
        try:
            import pytesseract
            from PIL import Image
        except ImportError as exc:
            raise AppError(
                "Image OCR is not available on this server",
                status_code=503,
                code="ocr_unavailable",
            ) from exc

        try:
            # The context manager closes the file handle that Image.open()
            # keeps open for lazy loading.
            with Image.open(file_path) as image:
                text = pytesseract.image_to_string(image, lang="spa+eng").strip()
        except pytesseract.TesseractNotFoundError as exc:
            # Python packages are present but the tesseract executable is not.
            raise AppError(
                "Image OCR is not available on this server",
                status_code=503,
                code="ocr_unavailable",
            ) from exc

        if not text:
            raise AppError("Image does not contain recognizable text", status_code=422, code="empty_extraction")
        return text
|
||||
Reference in New Issue
Block a user