feat: enhance backend security and configuration

- Updated Dockerfile to improve security with a non-root user and added health checks.
- Modified docker-compose.yml to set containers as read-only, restrict ports to localhost, and implement health checks.
- Enhanced .env.example with additional environment variables for security and configuration.
- Improved FastAPI application with middleware for security headers, CORS, and body size limits.
- Refactored authentication flow in auth.py to include state validation and improved error handling.
- Added rate limiting to various endpoints to prevent abuse.
- Updated researcher and publication handling to ensure better validation and error management.
2026-05-08 11:19:52 +02:00
parent 96e58dbd16
commit af1b8e9956
37 changed files with 1375 additions and 282 deletions
@@ -0,0 +1,35 @@
+"""
+Middleware que limita el tamaño máximo del cuerpo de la petición.
+
+Evita ataques de agotamiento de memoria/CPU enviando bodies enormes a
+endpoints POST. Se aplica antes de que FastAPI deserialice el JSON.
+"""
+
+from __future__ import annotations
+
+from starlette.middleware.base import BaseHTTPMiddleware
+from starlette.requests import Request
+from starlette.responses import JSONResponse, Response
+
+
+class BodySizeLimitMiddleware(BaseHTTPMiddleware):
+    """
+    Reject requests whose declared ``Content-Length`` exceeds ``max_bytes``.
+
+    Only the declared length is inspected; the body itself is never read
+    here. Requests that carry no ``Content-Length`` header are passed
+    through unchanged, so this check alone does not bound bodies sent
+    without a declared length.
+    """
+
+    def __init__(self, app, *, max_bytes: int):
+        """Wrap ``app`` and remember the maximum accepted body size in bytes."""
+        super().__init__(app)
+        self._max_bytes = max_bytes
+
+    async def dispatch(self, request: Request, call_next) -> Response:
+        """
+        Validate ``Content-Length`` before handing the request downstream.
+
+        Returns a 400 JSON response when the header is not a plain
+        non-negative decimal integer, a 413 JSON response when the declared
+        size exceeds the limit, and the downstream response otherwise.
+        """
+        content_length = request.headers.get("content-length")
+        if content_length is None:
+            return await call_next(request)
+
+        declared = content_length.strip()
+        try:
+            # int() alone accepts "-1", "+5" and "1_000"; a negative value
+            # would slip under the limit, so only ASCII digits are allowed.
+            if not (declared.isascii() and declared.isdigit()):
+                raise ValueError(declared)
+            # int() may still raise ValueError for absurdly long digit runs.
+            declared_size = int(declared)
+        except ValueError:
+            return JSONResponse(
+                status_code=400,
+                content={"detail": "Invalid Content-Length header"},
+            )
+
+        if declared_size > self._max_bytes:
+            return JSONResponse(
+                status_code=413,
+                content={"detail": "Request body too large"},
+            )
+
+        return await call_next(request)
@@ -0,0 +1,182 @@
+"""
+Configuración tipada y validada del backend.
+
+Centraliza la lectura de variables de entorno, valida secretos críticos al
+arranque y evita fallbacks inseguros (p. ej. JWT_SECRET="change_me") en
+entornos productivos.
+"""
+
+from __future__ import annotations
+
+import os
+from functools import lru_cache
+from pathlib import Path
+from typing import List, Literal
+from urllib.parse import urlparse
+
+from dotenv import load_dotenv
+from pydantic import Field, field_validator, model_validator
+from pydantic_settings import BaseSettings, SettingsConfigDict
+
+
+# Location of the .env file: two directory levels above this module's directory.
+_ENV_PATH = Path(__file__).resolve().parents[2] / ".env"
+# Load the file at import time; override=False keeps variables that are
+# already set in the process environment.
+load_dotenv(dotenv_path=_ENV_PATH, override=False)
+
+
+def _split_csv(value: str | List[str] | None) -> List[str]:
+    """
+    Normalize a comma-separated string (or a list) into a list of strings.
+
+    Each item is stripped of surrounding whitespace, items that are empty
+    after stripping are dropped, and trailing "/" characters are removed
+    from the items that remain. ``None`` yields an empty list.
+    """
+    if value is None:
+        return []
+
+    if isinstance(value, list):
+        raw_items = (str(item) for item in value)
+    else:
+        raw_items = value.split(",")
+
+    cleaned: List[str] = []
+    for raw in raw_items:
+        trimmed = raw.strip()
+        if trimmed:
+            cleaned.append(trimmed.rstrip("/"))
+    return cleaned
+
+
+class Settings(BaseSettings):
+    """
+    Typed application settings read from the environment and the .env file.
+
+    DATABASE_URL, JWT_SECRET, API_KEY_VALUE, ORCID_CLIENT_ID and
+    ORCID_CLIENT_SECRET have no default and must always be provided.
+
+    When ENVIRONMENT is "production", stricter checks apply:
+    - JWT_SECRET must not be a known weak value and must have >= 32 characters.
+    - CORS_ALLOWED_ORIGINS must be non-empty and must not contain "*".
+    - API_KEY_VALUE must have at least 24 characters.
+    - TRUSTED_HOSTS must not contain the "*" wildcard.
+
+    In every environment each CORS origin must be an http(s) URL with a host.
+    """
+
+    model_config = SettingsConfigDict(
+        env_file=str(_ENV_PATH),
+        env_file_encoding="utf-8",
+        extra="ignore",
+        case_sensitive=False,
+    )
+
+    ENVIRONMENT: Literal["development", "staging", "production"] = "development"
+    DEBUG: bool = False
+
+    DATABASE_URL: str = Field(...)
+    REDIS_URL: str | None = None
+    BASE_URL: str = "http://localhost:8000/api"
+
+    JWT_SECRET: str = Field(...)
+    JWT_ALGORITHM: str = "HS256"
+    JWT_EXPIRES_MINUTES: int = 720
+    JWT_ISSUER: str = "orcid-sword-backend"
+    JWT_AUDIENCE: str = "orcid-sword-frontend"
+
+    API_KEY_NAME: str = "X-API-Key"
+    API_KEY_VALUE: str = Field(...)
+
+    ORCID_CLIENT_ID: str = Field(...)
+    ORCID_CLIENT_SECRET: str = Field(...)
+    ORCID_REDIRECT_URI: str = "http://localhost:8000/api/auth/orcid/callback"
+    ORCID_OAUTH_STATE_ENABLED: bool = True
+    ORCID_OAUTH_STATE_COOKIE: str = "orcid_oauth_state"
+    ORCID_OAUTH_STATE_TTL_SECONDS: int = 600
+
+    # NOTE(review): pydantic-settings may JSON-decode list-typed fields coming
+    # from the environment before "before" validators run; confirm that a plain
+    # comma-separated value is accepted with the installed version.
+    CORS_ALLOWED_ORIGINS: List[str] = Field(default_factory=list)
+
+    TRUSTED_HOSTS: List[str] = Field(default_factory=lambda: ["*"])
+
+    RATE_LIMIT_DEFAULT: str = "60/minute"
+    RATE_LIMIT_AUTH: str = "10/minute"
+    RATE_LIMIT_SEARCH_ANON: str = "5/minute"
+    RATE_LIMIT_SEARCH_AUTH: str = "30/minute"
+    RATE_LIMIT_EXPORT: str = "20/minute"
+    RATE_LIMIT_SYNC: str = "5/minute"
+
+    MAX_ORCID_BATCH: int = 25
+    MAX_PUB_IDS_BATCH: int = 500
+    MAX_REQUEST_BODY_BYTES: int = 1_048_576  # 1 MiB
+
+    DOCS_ENABLED: bool = True
+
+    SECURITY_HSTS_SECONDS: int = 31_536_000
+    SECURITY_HSTS_INCLUDE_SUBDOMAINS: bool = True
+    SECURITY_HSTS_PRELOAD: bool = False
+
+    @field_validator("CORS_ALLOWED_ORIGINS", mode="before")
+    @classmethod
+    def _parse_cors(cls, v):
+        """Accept a comma-separated string or a list and normalize its items."""
+        return _split_csv(v)
+
+    @field_validator("TRUSTED_HOSTS", mode="before")
+    @classmethod
+    def _parse_trusted_hosts(cls, v):
+        """
+        Normalize the host list; an empty result falls back to ["*"].
+
+        Lists go through the same normalization as strings so that entries
+        are stripped consistently regardless of how the value was supplied.
+        """
+        return _split_csv(v) or ["*"]
+
+    @model_validator(mode="after")
+    def _validate_security(self) -> "Settings":
+        """
+        Enforce the security rules described in the class docstring.
+
+        Raises:
+            ValueError: when a production-only rule is violated or when any
+                CORS origin is not an http(s) URL with a host.
+        """
+        if self.ENVIRONMENT == "production":
+            weak = {"change_me", "changeme", "secret", "password", ""}
+            if self.JWT_SECRET.strip().lower() in weak:
+                raise ValueError(
+                    "JWT_SECRET es débil o está sin configurar. "
+                    "Define un secreto aleatorio fuerte (>= 32 bytes)."
+                )
+            if len(self.JWT_SECRET) < 32:
+                raise ValueError(
+                    "JWT_SECRET debe tener al menos 32 caracteres en producción."
+                )
+            if "*" in self.CORS_ALLOWED_ORIGINS:
+                raise ValueError(
+                    "CORS_ALLOWED_ORIGINS no puede contener '*' en producción."
+                )
+            if not self.CORS_ALLOWED_ORIGINS:
+                raise ValueError(
+                    "CORS_ALLOWED_ORIGINS debe definirse explícitamente en producción."
+                )
+            if not self.API_KEY_VALUE or len(self.API_KEY_VALUE) < 24:
+                raise ValueError(
+                    "API_KEY_VALUE debe tener al menos 24 caracteres en producción."
+                )
+            # Membership test (not equality): a wildcard mixed with explicit
+            # hosts, e.g. ["*", "example.com"], still allows every host.
+            if "*" in self.TRUSTED_HOSTS:
+                raise ValueError(
+                    "TRUSTED_HOSTS debe definirse explícitamente en producción."
+                )
+
+        for origin in self.CORS_ALLOWED_ORIGINS:
+            parsed = urlparse(origin)
+            if parsed.scheme not in {"http", "https"} or not parsed.netloc:
+                raise ValueError(f"Origen CORS inválido: {origin!r}")
+
+        return self
+
+    @property
+    def is_production(self) -> bool:
+        """True when ENVIRONMENT is "production"."""
+        return self.ENVIRONMENT == "production"
+
+    @property
+    def docs_url(self) -> str | None:
+        """"/docs" when DOCS_ENABLED is true, otherwise None."""
+        return "/docs" if self.DOCS_ENABLED else None
+
+    @property
+    def redoc_url(self) -> str | None:
+        """"/redoc" when DOCS_ENABLED is true, otherwise None."""
+        return "/redoc" if self.DOCS_ENABLED else None
+
+    @property
+    def openapi_url(self) -> str | None:
+        """"/openapi.json" when DOCS_ENABLED is true, otherwise None."""
+        return "/openapi.json" if self.DOCS_ENABLED else None
+
+
+@lru_cache(maxsize=1)
+def get_settings() -> Settings:
+    """
+    Return the shared Settings instance.
+
+    The result is cached so the environment and files are not re-read on
+    every call.
+    """
+    return Settings()  # type: ignore[call-arg]
+
+
+# Built at import time, so importing this module constructs and validates Settings.
+settings = get_settings()
+
+
+def reload_settings_for_tests() -> Settings:
+    """
+    Test helper: drop the cached Settings and rebuild it.
+
+    Rebinds this module's ``settings`` global to the fresh instance and
+    returns it.
+    """
+    global settings
+    get_settings.cache_clear()
+    settings = get_settings()
+    return settings
+
+
+# Names exported by `from ... import *`.
+__all__ = ["Settings", "get_settings", "reload_settings_for_tests", "settings"]
@@ -0,0 +1,67 @@
+"""
+Error handlers that do NOT leak sensitive information.
+
+- Unhandled exceptions always return a generic message plus an `error_id`.
+- Outside production, and only when DEBUG is enabled, the exception class
+  name is added as `type` to help debugging (never a traceback).
+- Validation errors are returned with FastAPI's standard 422 status.
+"""
+
+from __future__ import annotations
+
+import logging
+import uuid
+
+from fastapi import HTTPException, Request
+from fastapi.exceptions import RequestValidationError
+from fastapi.responses import JSONResponse
+from sqlalchemy.exc import SQLAlchemyError
+
+from app.core.config import settings
+
+
+# Dedicated logger used by the handlers below to record server-side errors.
+logger = logging.getLogger("app.error")
+
+
+async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
+    """
+    Render an HTTPException as JSON.
+
+    The response reuses the exception's status code, wraps its detail under
+    the "detail" key and forwards its headers when it has any.
+    """
+    body = {"detail": exc.detail}
+    extra_headers = getattr(exc, "headers", None)
+    return JSONResponse(
+        content=body,
+        status_code=exc.status_code,
+        headers=extra_headers,
+    )
+
+
+async def validation_exception_handler(
+    request: Request, exc: RequestValidationError
+) -> JSONResponse:
+    """
+    Return a 422 response listing the validation errors.
+
+    Only the "loc", "msg" and "type" entries of each error are copied into
+    the response; any other entry of the error is left out.
+    """
+    exposed_keys = ("loc", "msg", "type")
+    safe_errors = [
+        {key: err.get(key) for key in exposed_keys}
+        for err in exc.errors()
+    ]
+    return JSONResponse(content={"detail": safe_errors}, status_code=422)
+
+
+async def sqlalchemy_exception_handler(
+    request: Request, exc: SQLAlchemyError
+) -> JSONResponse:
+    """
+    Log a database error and answer with a generic 500.
+
+    A random UUID is written to the log together with the request method and
+    path, and returned to the client as "error_id" so both can be correlated.
+    """
+    incident_id = str(uuid.uuid4())
+    logger.exception(
+        "DB error [%s] on %s %s", incident_id, request.method, request.url.path
+    )
+    body = {"detail": "Database error", "error_id": incident_id}
+    return JSONResponse(content=body, status_code=500)
+
+
+async def unhandled_exception_handler(request: Request, exc: Exception) -> JSONResponse:
+    """
+    Log an unexpected exception and answer with a generic 500.
+
+    The response carries a random "error_id" that is also written to the log.
+    The exception class name is added as "type" only when DEBUG is enabled
+    and the environment is not production.
+    """
+    incident_id = str(uuid.uuid4())
+    logger.exception(
+        "Unhandled error [%s] on %s %s", incident_id, request.method, request.url.path
+    )
+
+    payload: dict = {"detail": "Internal server error", "error_id": incident_id}
+    expose_type = settings.DEBUG and not settings.is_production
+    if expose_type:
+        payload["type"] = type(exc).__name__
+    return JSONResponse(content=payload, status_code=500)
@@ -0,0 +1,28 @@
+"""
+Minimal logging configuration.
+
+- Formats records with timestamp, level and logger name.
+- Uses DEBUG level when `settings.DEBUG` is enabled and INFO otherwise.
+- Quiets noisy third-party loggers so they do not leak headers.
+"""
+
+from __future__ import annotations
+
+import logging
+
+from app.core.config import settings
+
+
+# Record layout: timestamp, level name, logger name, then the message.
+_LOG_FORMAT = "%(asctime)s %(levelname)s %(name)s :: %(message)s"
+
+
+def configure_logging() -> None:
+    """
+    Configure the root logger and adjust selected library loggers.
+
+    The root level is DEBUG when ``settings.DEBUG`` is true and INFO
+    otherwise. httpx, httpcore, the SQLAlchemy engine logger and
+    uvicorn.access are set to WARNING; uvicorn.error follows the root level.
+    """
+    if settings.DEBUG:
+        level = logging.DEBUG
+    else:
+        level = logging.INFO
+
+    logging.basicConfig(format=_LOG_FORMAT, level=level)
+
+    logger_levels = {
+        "httpx": logging.WARNING,
+        "httpcore": logging.WARNING,
+        "sqlalchemy.engine.Engine": logging.WARNING,
+        "uvicorn.error": level,
+        "uvicorn.access": logging.WARNING,
+    }
+    for logger_name, logger_level in logger_levels.items():
+        logging.getLogger(logger_name).setLevel(logger_level)
@@ -0,0 +1,60 @@
+"""
+Rate limiting built on SlowAPI.
+
+- Uses Redis as the storage backend when `REDIS_URL` is set (shared across workers).
+- When `REDIS_URL` is unset, no storage URI is passed and SlowAPI's default
+  storage is used.
+- The client is keyed by its researcher identity (`orcid_id`, or `id` as a
+  fallback) when an authenticated researcher is present on the request state,
+  and by remote IP otherwise, so an authenticated user does not share quota
+  with their IP.
+"""
+
+from __future__ import annotations
+
+from typing import Optional
+
+from slowapi import Limiter
+from slowapi.errors import RateLimitExceeded
+from slowapi.util import get_remote_address
+from starlette.requests import Request
+from starlette.responses import JSONResponse
+
+from app.core.config import settings
+
+
+def _key_func(request: Request) -> str:
+    """
+    Build the rate-limit key for a request.
+
+    - Without a researcher on ``request.state`` the key is "ip:<remote address>".
+    - Otherwise the key is "user:<orcid_id>", falling back to the
+      researcher's ``id`` when ``orcid_id`` is missing or falsy.
+    """
+    researcher = getattr(request.state, "researcher", None)
+    if researcher is None:
+        return f"ip:{get_remote_address(request)}"
+
+    identity = getattr(researcher, "orcid_id", None) or researcher.id
+    return f"user:{identity}"
+
+
+def _build_limiter() -> Limiter:
+    """
+    Create the SlowAPI limiter from the application settings.
+
+    ``settings.REDIS_URL`` is passed as the storage URI (it may be None) and
+    ``settings.RATE_LIMIT_DEFAULT`` becomes the only default limit.
+    """
+    # NOTE(review): "fixed-window-elastic-expiry" may not be accepted by newer
+    # releases of the underlying `limits` package - confirm against the pinned version.
+    limiter_options = {
+        "key_func": _key_func,
+        "default_limits": [settings.RATE_LIMIT_DEFAULT],
+        "storage_uri": settings.REDIS_URL,
+        "headers_enabled": True,
+        "strategy": "fixed-window-elastic-expiry",
+    }
+    return Limiter(**limiter_options)
+
+
+# Module-level limiter instance, created once at import time.
+limiter = _build_limiter()
+
+
+def rate_limit_exceeded_handler(request: Request, exc: RateLimitExceeded) -> JSONResponse:
+    """
+    Uniform 429 response for requests that exceed a rate limit.
+
+    The body is a fixed message and ``Retry-After`` is always "60"; nothing
+    from the exception is echoed back, so this response does not disclose
+    the limit that was hit.
+    """
+    body = {"detail": "Too many requests, slow down."}
+    retry_headers = {"Retry-After": "60"}
+    return JSONResponse(content=body, status_code=429, headers=retry_headers)
@@ -0,0 +1,88 @@
+"""
+Middleware de cabeceras de seguridad HTTP.
+
+Aplica un perfil seguro por defecto:
+- Strict-Transport-Security (HSTS) — fuerza HTTPS en navegadores compatibles.
+- X-Content-Type-Options: nosniff
+- X-Frame-Options: DENY (clickjacking)
+- Referrer-Policy: strict-origin-when-cross-origin
+- Permissions-Policy: bloquea APIs sensibles por defecto
+- Cross-Origin-Opener-Policy / Resource-Policy: aislamiento del navegador
+- Content-Security-Policy laxa para Swagger/OpenAPI (CDN), restrictiva para el resto.
+
+NOTA: El frontend SPA tiene su propia CSP en su servidor. Aquí
+endurecemos lo que sirve el backend (JSON, XML, ZIP, /docs, /redoc, etc.).
+"""
+
+from __future__ import annotations
+
+from starlette.middleware.base import BaseHTTPMiddleware
+from starlette.requests import Request
+from starlette.responses import Response
+
+from app.core.config import Settings
+
+
+# Exact request paths that receive the relaxed documentation CSP.
+_DOCS_PATHS = ("/docs", "/redoc", "/openapi.json")
+
+# Default policy for every other path: nothing may be loaded, framed or submitted.
+_BASE_CSP = (
+    "default-src 'none'; "
+    "frame-ancestors 'none'; "
+    "base-uri 'none'; "
+    "form-action 'none'"
+)
+
+# Relaxed policy for the documentation pages: same-origin content plus
+# scripts, styles and fonts from cdn.jsdelivr.net, inline script/style, and
+# images from fastapi.tiangolo.com.
+_SWAGGER_CSP = (
+    "default-src 'self'; "
+    "img-src 'self' data: https://fastapi.tiangolo.com; "
+    "script-src 'self' https://cdn.jsdelivr.net 'unsafe-inline'; "
+    "style-src 'self' https://cdn.jsdelivr.net 'unsafe-inline'; "
+    "font-src 'self' data: https://cdn.jsdelivr.net; "
+    "connect-src 'self'; "
+    "frame-ancestors 'none'; "
+    "base-uri 'self'; "
+    "form-action 'self'"
+)
+
+
+class SecurityHeadersMiddleware(BaseHTTPMiddleware):
+    """
+    Add security headers to every response.
+
+    Headers are added with ``setdefault``, so a value already set by the
+    downstream application is kept.
+    """
+
+    def __init__(self, app, settings: Settings):
+        """Wrap ``app`` and keep the settings used to build the HSTS header."""
+        super().__init__(app)
+        self._settings = settings
+
+    async def dispatch(self, request: Request, call_next) -> Response:
+        """
+        Call the downstream app and decorate its response.
+
+        - Always sets the static hardening headers.
+        - Uses the relaxed CSP for the documentation paths and the strict
+          one for everything else.
+        - Adds HSTS when the request scheme is https or the environment is
+          production.
+        - Removes ``Server`` and ``X-Powered-By`` if the response has them.
+        """
+        response: Response = await call_next(request)
+
+        response.headers.setdefault("X-Content-Type-Options", "nosniff")
+        response.headers.setdefault("X-Frame-Options", "DENY")
+        response.headers.setdefault("Referrer-Policy", "strict-origin-when-cross-origin")
+        response.headers.setdefault(
+            "Permissions-Policy",
+            "geolocation=(), microphone=(), camera=(), payment=(), usb=(), "
+            "accelerometer=(), gyroscope=(), magnetometer=(), interest-cohort=()",
+        )
+        response.headers.setdefault("Cross-Origin-Opener-Policy", "same-origin")
+        response.headers.setdefault("Cross-Origin-Resource-Policy", "same-site")
+        response.headers.setdefault("X-Permitted-Cross-Domain-Policies", "none")
+
+        if request.url.path in _DOCS_PATHS:
+            response.headers.setdefault("Content-Security-Policy", _SWAGGER_CSP)
+        else:
+            response.headers.setdefault("Content-Security-Policy", _BASE_CSP)
+
+        if request.url.scheme == "https" or self._settings.is_production:
+            hsts = f"max-age={self._settings.SECURITY_HSTS_SECONDS}"
+            if self._settings.SECURITY_HSTS_INCLUDE_SUBDOMAINS:
+                hsts += "; includeSubDomains"
+            if self._settings.SECURITY_HSTS_PRELOAD:
+                hsts += "; preload"
+            response.headers.setdefault("Strict-Transport-Security", hsts)
+
+        # Starlette's MutableHeaders is not a MutableMapping and has no pop();
+        # calling it raises AttributeError, so delete the headers explicitly.
+        # NOTE(review): a Server header appended by the ASGI server after this
+        # middleware runs is not affected here - confirm the server config.
+        for banner_header in ("Server", "X-Powered-By"):
+            if banner_header in response.headers:
+                del response.headers[banner_header]
+
+        return response