From af1b8e995684f937680977df48725d362d1e9424 Mon Sep 17 00:00:00 2001 From: Mireya Cueto Garrido Date: Fri, 8 May 2026 11:19:52 +0200 Subject: [PATCH 1/2] feat: enhance backend security and configuration - Updated Dockerfile to improve security with a non-root user and added health checks. - Modified docker-compose.yml to set containers as read-only, restrict ports to localhost, and implement health checks. - Enhanced .env.example with additional environment variables for security and configuration. - Improved FastAPI application with middleware for security headers, CORS, and body size limits. - Refactored authentication flow in auth.py to include state validation and improved error handling. - Added rate limiting to various endpoints to prevent abuse. - Updated researcher and publication handling to ensure better validation and error management. --- .gitignore | 7 + backend/.dockerignore | 20 ++ backend/.env.example | 78 +++++++- backend/Dockerfile | 32 ++- backend/app/api/auth.py | 112 +++++++---- backend/app/api/export.py | 182 ++++++++++-------- backend/app/api/researchers.py | 99 ++++++---- backend/app/core/__init__.py | 0 backend/app/core/body_size.py | 35 ++++ backend/app/core/config.py | 182 ++++++++++++++++++ backend/app/core/error_handlers.py | 67 +++++++ backend/app/core/logging_config.py | 28 +++ backend/app/core/rate_limit.py | 60 ++++++ backend/app/core/security_headers.py | 88 +++++++++ backend/app/db/base.py | 4 + backend/app/db/models.py | 9 + .../db/repositories/publication_repository.py | 20 ++ .../db/repositories/researcher_repository.py | 15 ++ .../app/db/repositories/syncjob_repository.py | 11 ++ backend/app/db/session.py | 10 + backend/app/main.py | 152 +++++++++++---- backend/app/scheduler/sync_scheduler.py | 10 + backend/app/schema/auth.py | 6 + backend/app/schema/export.py | 23 +++ backend/app/schema/publication.py | 4 + backend/app/schema/researcher.py | 35 +++- backend/app/security/api_key.py | 59 +++--- backend/app/security/jwt.py | 125 +++++++++--- backend/app/security/oauth_state.py | 76 ++++++++ backend/app/services/normalizer.py | 6 + backend/app/services/orcid_client.py | 10 + backend/app/services/sword_generator.py | 3 + backend/app/services/sync_service.py | 15 ++ backend/app/services/zip_generator.py | 6 +- backend/app/utils/orcid_validator.py | 19 +- backend/requirements.txt | 8 +- docker-compose.yml | 41 ++-- 37 files changed, 1375 insertions(+), 282 deletions(-) create mode 100644 backend/.dockerignore create mode 100644 backend/app/core/__init__.py create mode 100644 backend/app/core/body_size.py create mode 100644 backend/app/core/config.py create mode 100644 backend/app/core/error_handlers.py create mode 100644 backend/app/core/logging_config.py create mode 100644 backend/app/core/rate_limit.py create mode 100644 backend/app/core/security_headers.py create mode 100644 backend/app/schema/export.py create mode 100644 backend/app/security/oauth_state.py diff --git a/.gitignore b/.gitignore index 1cc1173..31265ac 100644 --- a/.gitignore +++ b/.gitignore @@ -22,6 +22,13 @@ ENV/ # FastAPI / Uvicorn *.pid +# Test / type checkers +.pytest_cache/ +.mypy_cache/ +.ruff_cache/ +.coverage +htmlcov/ + # --- NODE FRONTEND --- node_modules/ dist/ diff --git a/backend/.dockerignore b/backend/.dockerignore new file mode 100644 index 0000000..42336ce --- /dev/null +++ b/backend/.dockerignore @@ -0,0 +1,20 @@ +.env +.env.* +!.env.example +__pycache__/ +*.pyc +*.pyo +*.pyd +*.log +*.sqlite3 +*.db +.pytest_cache/ +.mypy_cache/ +.ruff_cache/ +.venv/ +venv/ +.git/ +.gitignore +README.md +docs/ +tests/ diff --git a/backend/.env.example b/backend/.env.example index be02114..c6b9fe5 100644 --- a/backend/.env.example +++ b/backend/.env.example @@ -1,19 +1,81 @@ -ORCID_CLIENT_ID=123412341234 -ORCID_CLIENT_SECRET=123412341234 - -API_KEY_NAME=X-API-Key -API_KEY_VALUE=123412341234 +# ============================================================ +# ENVIRONMENT +# ============================================================ +ENVIRONMENT=development +DEBUG=false +# ============================================================ +# DATABASE / CACHE +# ============================================================ DATABASE_URL=postgresql://postgres:postgres@db:5432/orcid_db REDIS_URL=redis://redis:6379/0 +# ============================================================ +# BASE URL (uso interno del scheduler) +# ============================================================ BASE_URL=http://localhost:8000/api +# ============================================================ +# CORS — lista blanca estricta separada por comas +# Nunca uses "*" si allow_credentials=true. +# ============================================================ +CORS_ALLOWED_ORIGINS=http://localhost:5173 + +# ============================================================ +# Trusted Hosts — anti Host-header injection (en prod, sé explícito) +# ============================================================ +TRUSTED_HOSTS=* + +# ============================================================ # JWT (login ORCID) -JWT_SECRET=change_me +# Genera un secreto fuerte: `openssl rand -base64 64` +# ============================================================ +JWT_SECRET=change_me_to_a_long_random_value_at_least_32_chars JWT_ALGORITHM=HS256 JWT_EXPIRES_MINUTES=720 +JWT_ISSUER=orcid-sword-backend +JWT_AUDIENCE=orcid-sword-frontend +# ============================================================ +# API key máquina-a-máquina (scheduler interno) +# Genera con: `python -c "import secrets;print(secrets.token_urlsafe(48))"` +# ============================================================ +API_KEY_NAME=X-API-Key +API_KEY_VALUE=replace_with_a_strong_random_value_min_24_chars + +# ============================================================ # ORCID OAuth 3-legged (authorization code) -# Debe coincidir exactamente con el redirect URI configurado en tu app ORCID. -ORCID_REDIRECT_URI=http://localhost:8000/api/auth/orcid/callback \ No newline at end of file +# ============================================================ +ORCID_CLIENT_ID=APP-XXXXXXXXXXXXXXXX +ORCID_CLIENT_SECRET=replace_me +ORCID_REDIRECT_URI=http://localhost:8000/api/auth/orcid/callback +ORCID_OAUTH_STATE_ENABLED=true + +# ============================================================ +# Rate limits (formato slowapi: "/") +# ============================================================ +RATE_LIMIT_DEFAULT=60/minute +RATE_LIMIT_AUTH=10/minute +RATE_LIMIT_SEARCH_ANON=5/minute +RATE_LIMIT_SEARCH_AUTH=30/minute +RATE_LIMIT_EXPORT=20/minute +RATE_LIMIT_SYNC=5/minute + +# ============================================================ +# Tope de tamaños (anti DoS) +# ============================================================ +MAX_ORCID_BATCH=25 +MAX_PUB_IDS_BATCH=500 +MAX_REQUEST_BODY_BYTES=1048576 + +# ============================================================ +# Documentación interactiva (deshabilita en producción si no es necesaria) +# ============================================================ +DOCS_ENABLED=true + +# ============================================================ +# HSTS +# ============================================================ +SECURITY_HSTS_SECONDS=31536000 +SECURITY_HSTS_INCLUDE_SUBDOMAINS=true +SECURITY_HSTS_PRELOAD=false diff --git a/backend/Dockerfile b/backend/Dockerfile index e3f2064..5251f77 100644 --- a/backend/Dockerfile +++ b/backend/Dockerfile @@ -1,10 +1,36 @@ -FROM python:3.12-slim +FROM python:3.12-slim AS base + +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 \ + PIP_NO_CACHE_DIR=1 \ + PIP_DISABLE_PIP_VERSION_CHECK=1 + +RUN apt-get update \ + && apt-get install -y --no-install-recommends curl \ + && rm -rf /var/lib/apt/lists/* + +RUN groupadd --system --gid 1001 app \ + && useradd --system --uid 1001 --gid app --home /app --shell /usr/sbin/nologin app WORKDIR /app -COPY requirements.txt . +COPY requirements.txt ./ RUN pip install --no-cache-dir -r requirements.txt COPY app ./app -CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"] +RUN chown -R app:app /app + +USER app + +EXPOSE 8000 + +HEALTHCHECK --interval=30s --timeout=5s --start-period=15s --retries=3 \ + CMD curl -fsS http://127.0.0.1:8000/health || exit 1 + +CMD ["uvicorn", "app.main:app", \ + "--host", "0.0.0.0", \ + "--port", "8000", \ + "--proxy-headers", \ + "--forwarded-allow-ips", "*", \ + "--no-server-header"] diff --git a/backend/app/api/auth.py b/backend/app/api/auth.py index 205cb95..89ecc96 100644 --- a/backend/app/api/auth.py +++ b/backend/app/api/auth.py @@ -1,64 +1,68 @@ +import logging + import httpx -import os -from pathlib import Path -from dotenv import load_dotenv -from fastapi import APIRouter, Depends, HTTPException, status -from fastapi.responses import RedirectResponse +from fastapi import APIRouter, Depends, HTTPException, Request, status +from fastapi.responses import JSONResponse, RedirectResponse from sqlalchemy.orm import Session +from app.core.config import settings +from app.core.rate_limit import limiter from app.db.models import Researcher from app.db.session import get_db from app.schema.auth import OrcidLoginResponseSchema from app.security.jwt import create_access_token +from app.security.oauth_state import ( + attach_state_cookie, + clear_state_cookie, + generate_state, + validate_state, +) from app.services.orcid_client import ORCIDClient from app.utils.orcid_validator import is_valid_orcid -# Asegura que al ejecutar `uvicorn` local también se carga `backend/.env`. -_ENV_PATH = Path(__file__).resolve().parents[2] / ".env" -load_dotenv(dotenv_path=_ENV_PATH, override=False) - router = APIRouter(prefix="/auth", tags=["auth"]) +logger = logging.getLogger("app.auth") def _extract_display_name(record: dict) -> str | None: person = (record or {}).get("person") or {} name = person.get("name") or {} - given = ((name.get("given-names") or {}).get("value")) if isinstance(name.get("given-names"), dict) else None - family = ((name.get("family-name") or {}).get("value")) if isinstance(name.get("family-name"), dict) else None - full = " ".join([p for p in [given, family] if p]) + given_obj = name.get("given-names") + family_obj = name.get("family-name") + given = given_obj.get("value") if isinstance(given_obj, dict) else None + family = family_obj.get("value") if isinstance(family_obj, dict) else None + full = " ".join(p for p in [given, family] if p) return full or None def _orcid_redirect_uri() -> str: - # Debe coincidir con el `redirect_uri` registrado en tu integración ORCID. - return os.getenv("ORCID_REDIRECT_URI") or "http://localhost:8000/api/auth/orcid/callback" + return settings.ORCID_REDIRECT_URI def _complete_oauth_login(*, code: str, db: Session) -> OrcidLoginResponseSchema: """ - Completa el login OAuth: - 1) intercambio del `code` en ORCID (server-side) - 2) crea/actualiza el investigador - 3) emite nuestro JWT + 1) Intercambia el `code` con ORCID (server-side). + 2) Crea/actualiza el investigador. + 3) Emite el JWT propio. """ - if not code: - raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Missing ORCID authorization code") + if not code or len(code) > 256: + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid ORCID authorization code") client = ORCIDClient() - redirect_uri = _orcid_redirect_uri() - try: - token_data = client.exchange_authorization_code(code=code, redirect_uri=redirect_uri) + token_data = client.exchange_authorization_code(code=code, redirect_uri=_orcid_redirect_uri()) except httpx.HTTPStatusError as exc: + logger.warning("ORCID token exchange failed: %s", exc.response.status_code) raise HTTPException( status_code=status.HTTP_502_BAD_GATEWAY, - detail=f"ORCID token error ({exc.response.status_code})", - ) - except httpx.TimeoutException: - raise HTTPException(status_code=status.HTTP_504_GATEWAY_TIMEOUT, detail="ORCID timeout") - except Exception: - raise HTTPException(status_code=status.HTTP_502_BAD_GATEWAY, detail="ORCID unavailable") + detail="ORCID token exchange failed", + ) from exc + except httpx.TimeoutException as exc: + raise HTTPException(status_code=status.HTTP_504_GATEWAY_TIMEOUT, detail="ORCID timeout") from exc + except Exception as exc: + logger.exception("Unexpected error during ORCID token exchange") + raise HTTPException(status_code=status.HTTP_502_BAD_GATEWAY, detail="ORCID unavailable") from exc orcid_id = (token_data.get("orcid") or "").strip() if not is_valid_orcid(orcid_id): @@ -66,7 +70,6 @@ def _complete_oauth_login(*, code: str, db: Session) -> OrcidLoginResponseSchema display_name = token_data.get("name") if not display_name: - # Fallback si ORCID no devuelve `name` en el token response. try: record = client.fetch_record(orcid_id) display_name = _extract_display_name(record) @@ -89,21 +92,54 @@ def _complete_oauth_login(*, code: str, db: Session) -> OrcidLoginResponseSchema return OrcidLoginResponseSchema(access_token=token) -@router.get("/orcid/authorize") -def authorize_orcid(): +def complete_oauth_login_response( + *, request: Request, code: str, state: str | None, db: Session +) -> JSONResponse: """ - Inicia el flujo OAuth 3-legged (authorization code) hacia ORCID. + Valida `state`, completa el login y limpia la cookie del state. + Devuelve directamente la JSONResponse (para poder borrar cookie). + """ + validate_state(request, state) + payload = _complete_oauth_login(code=code, db=db) + json_resp = JSONResponse(content=payload.model_dump()) + clear_state_cookie(json_resp) + return json_resp + + +# --------------------------------------------------------- +# ENDPOINT 1: Iniciar flujo OAuth 3-legged hacia ORCID +# --------------------------------------------------------- + +@router.get("/orcid/authorize") +@limiter.limit(settings.RATE_LIMIT_AUTH) +def authorize_orcid(request: Request): + """ + Genera la URL de autorización ORCID y persiste el `state` en cookie + HttpOnly para validarlo en el callback (anti-CSRF). """ client = ORCIDClient() + state = generate_state() if settings.ORCID_OAUTH_STATE_ENABLED else None authorize_url = client.build_authorize_url( redirect_uri=_orcid_redirect_uri(), - # Solo necesitamos el Authenticated iD del usuario. scope="/authenticate", + state=state, ) - return RedirectResponse(authorize_url) + response = RedirectResponse(authorize_url) + if state: + attach_state_cookie(response, state) + return response +# --------------------------------------------------------- +# ENDPOINT 2: Callback OAuth 3-legged desde ORCID +# --------------------------------------------------------- + @router.get("/orcid/callback", response_model=OrcidLoginResponseSchema) -def orcid_callback(code: str, db: Session = Depends(get_db)): - return _complete_oauth_login(code=code, db=db) - +@limiter.limit(settings.RATE_LIMIT_AUTH) +def orcid_callback( + request: Request, + code: str, + state: str | None = None, + db: Session = Depends(get_db), +): + return complete_oauth_login_response(request=request, code=code, state=state, db=db) diff --git a/backend/app/api/export.py b/backend/app/api/export.py index 2152105..7e20fd5 100644 --- a/backend/app/api/export.py +++ b/backend/app/api/export.py @@ -1,115 +1,146 @@ -from fastapi import APIRouter, Depends, HTTPException -from fastapi.responses import Response -from sqlalchemy.orm import Session +from typing import Iterable, List from uuid import UUID +from fastapi import APIRouter, Body, Depends, HTTPException, Path, Request +from fastapi.responses import Response +from sqlalchemy.orm import Session + +from app.core.config import settings +from app.core.rate_limit import limiter +from app.db.models import Publication, PublicationDownload, Researcher from app.db.session import get_db -from app.db.models import Publication, Researcher, PublicationDownload from app.security.api_key import get_api_key_optional from app.security.jwt import get_optional_current_researcher from app.services.sword_generator import SWORDGenerator from app.services.zip_generator import ZIPGenerator +from app.utils.orcid_validator import ORCID_PATTERN, is_valid_orcid + router = APIRouter(prefix="/export") -def validate_uuid_list(pub_ids: list[str]) -> list[UUID]: - valid_ids = [] - for pid in pub_ids: - try: - valid_ids.append(UUID(pid)) - except Exception: - raise HTTPException( - status_code=400, - detail=f"Invalid publication ID (not UUID): {pid}" - ) - return valid_ids +def _ensure_credentials(api_key: str | None, current: Researcher | None) -> None: + if not api_key and not current: + raise HTTPException(status_code=401, detail="Authentication required") +def _record_downloads(db: Session, current: Researcher, pubs: Iterable[Publication]) -> None: + """ + Inserta marcadores de descarga (researcher_id, publication_id). + + - Resuelve descargas existentes con UNA sola query. + - Solo añade las que faltan. + """ + pub_ids = [p.id for p in pubs] + if not pub_ids: + return + + existing_ids = { + row[0] + for row in ( + db.query(PublicationDownload.publication_id) + .filter( + PublicationDownload.researcher_id == current.id, + PublicationDownload.publication_id.in_(pub_ids), + ) + .all() + ) + } + + new_rows = [ + PublicationDownload(researcher_id=current.id, publication_id=pid) + for pid in pub_ids + if pid not in existing_ids + ] + if new_rows: + db.add_all(new_rows) + db.commit() + + +def _validate_pub_ids(pub_ids: List[UUID]) -> List[UUID]: + if len(pub_ids) > settings.MAX_PUB_IDS_BATCH: + raise HTTPException(status_code=413, detail="Too many publication IDs") + return pub_ids + + +# --------------------------------------------------------- +# ENDPOINT 1: SWORD múltiples publicaciones +# --------------------------------------------------------- + @router.post("/sword/publications") +@limiter.limit(settings.RATE_LIMIT_EXPORT) async def export_multiple_sword( - pub_ids: list[str], + request: Request, + pub_ids: List[UUID] = Body(..., min_length=1, max_length=settings.MAX_PUB_IDS_BATCH), db: Session = Depends(get_db), api_key: str | None = Depends(get_api_key_optional), current: Researcher | None = Depends(get_optional_current_researcher), ): - if not api_key and not current: - raise HTTPException(status_code=401, detail="Missing credentials") - validate_uuid_list(pub_ids) + _ensure_credentials(api_key, current) + _validate_pub_ids(pub_ids) pubs = db.query(Publication).filter(Publication.id.in_(pub_ids)).all() - if not pubs: raise HTTPException(status_code=404, detail="No publications found") researcher = db.query(Researcher).filter_by(id=pubs[0].researcher_id).first() xml_bytes = SWORDGenerator.generate_feed_xml(researcher, pubs) - # Registrar descarga solo si hay usuario logueado if current: - for p in pubs: - exists = ( - db.query(PublicationDownload) - .filter( - PublicationDownload.researcher_id == current.id, - PublicationDownload.publication_id == p.id, - ) - .first() - ) - if not exists: - db.add(PublicationDownload(researcher_id=current.id, publication_id=p.id)) - db.commit() + _record_downloads(db, current, pubs) + return Response(content=xml_bytes, media_type="application/xml") +# --------------------------------------------------------- +# ENDPOINT 2: SWORD por investigador +# --------------------------------------------------------- + @router.get("/sword/researcher/{orcid_id}") +@limiter.limit(settings.RATE_LIMIT_EXPORT) async def export_researcher_sword( - orcid_id: str, + request: Request, + orcid_id: str = Path(min_length=19, max_length=19, pattern=ORCID_PATTERN), db: Session = Depends(get_db), api_key: str | None = Depends(get_api_key_optional), current: Researcher | None = Depends(get_optional_current_researcher), ): - if not api_key and not current: - raise HTTPException(status_code=401, detail="Missing credentials") + _ensure_credentials(api_key, current) + if not is_valid_orcid(orcid_id): + raise HTTPException(status_code=400, detail="Invalid ORCID iD") + researcher = db.query(Researcher).filter_by(orcid_id=orcid_id).first() if not researcher: raise HTTPException(status_code=404, detail="Researcher not found") pubs = db.query(Publication).filter_by(researcher_id=researcher.id).all() - if not pubs: raise HTTPException(status_code=404, detail="No publications found for this researcher") xml_bytes = SWORDGenerator.generate_feed_xml(researcher, pubs) if current: - for p in pubs: - exists = ( - db.query(PublicationDownload) - .filter( - PublicationDownload.researcher_id == current.id, - PublicationDownload.publication_id == p.id, - ) - .first() - ) - if not exists: - db.add(PublicationDownload(researcher_id=current.id, publication_id=p.id)) - db.commit() + _record_downloads(db, current, pubs) + return Response(content=xml_bytes, media_type="application/xml") +# --------------------------------------------------------- +# ENDPOINT 3: ZIP múltiples publicaciones +# --------------------------------------------------------- + @router.post("/zip/publications") +@limiter.limit(settings.RATE_LIMIT_EXPORT) async def export_multiple_zip( - pub_ids: list[str], + request: Request, + pub_ids: List[UUID] = Body(..., min_length=1, max_length=settings.MAX_PUB_IDS_BATCH), db: Session = Depends(get_db), api_key: str | None = Depends(get_api_key_optional), current: Researcher | None = Depends(get_optional_current_researcher), ): - if not api_key and not current: - raise HTTPException(status_code=401, detail="Missing credentials") - validate_uuid_list(pub_ids) + _ensure_credentials(api_key, current) + _validate_pub_ids(pub_ids) pubs = db.query(Publication).filter(Publication.id.in_(pub_ids)).all() - if not pubs: raise HTTPException(status_code=404, detail="No publications found") @@ -117,51 +148,38 @@ async def export_multiple_zip( zip_bytes = ZIPGenerator.generate_zip(researcher, pubs) if current: - for p in pubs: - exists = ( - db.query(PublicationDownload) - .filter( - PublicationDownload.researcher_id == current.id, - PublicationDownload.publication_id == p.id, - ) - .first() - ) - if not exists: - db.add(PublicationDownload(researcher_id=current.id, publication_id=p.id)) - db.commit() + _record_downloads(db, current, pubs) + return Response(content=zip_bytes, media_type="application/zip") +# --------------------------------------------------------- +# ENDPOINT 4: ZIP por investigador +# --------------------------------------------------------- + @router.get("/zip/researcher/{orcid_id}") +@limiter.limit(settings.RATE_LIMIT_EXPORT) async def export_researcher_zip( - orcid_id: str, + request: Request, + orcid_id: str = Path(min_length=19, max_length=19, pattern=ORCID_PATTERN), db: Session = Depends(get_db), api_key: str | None = Depends(get_api_key_optional), current: Researcher | None = Depends(get_optional_current_researcher), ): - if not api_key and not current: - raise HTTPException(status_code=401, detail="Missing credentials") + _ensure_credentials(api_key, current) + if not is_valid_orcid(orcid_id): + raise HTTPException(status_code=400, detail="Invalid ORCID iD") + researcher = db.query(Researcher).filter_by(orcid_id=orcid_id).first() if not researcher: raise HTTPException(status_code=404, detail="Researcher not found") pubs = db.query(Publication).filter_by(researcher_id=researcher.id).all() - if not pubs: raise HTTPException(status_code=404, detail="No publications found for this researcher") zip_bytes = ZIPGenerator.generate_zip(researcher, pubs) if current: - for p in pubs: - exists = ( - db.query(PublicationDownload) - .filter( - PublicationDownload.researcher_id == current.id, - PublicationDownload.publication_id == p.id, - ) - .first() - ) - if not exists: - db.add(PublicationDownload(researcher_id=current.id, publication_id=p.id)) - db.commit() + _record_downloads(db, current, pubs) + return Response(content=zip_bytes, media_type="application/zip") diff --git a/backend/app/api/researchers.py b/backend/app/api/researchers.py index c13b8e3..c8df04d 100644 --- a/backend/app/api/researchers.py +++ b/backend/app/api/researchers.py @@ -2,11 +2,14 @@ from datetime import datetime from typing import List import httpx -from fastapi import APIRouter, Depends, HTTPException +from fastapi import APIRouter, Depends, HTTPException, Path, Request from sqlalchemy.orm import Session -from app.db.models import Publication, Researcher +from app.core.config import settings +from app.core.rate_limit import limiter +from app.db.models import Publication, PublicationDownload, Researcher from app.db.session import get_db +from app.schema.publication import PublicationSchema from app.schema.researcher import ( ResearcherBatchSearchRequestSchema, ResearcherBatchSearchResponseSchema, @@ -14,18 +17,15 @@ from app.schema.researcher import ( ResearcherStatsSchema, ResearcherWithPublicationsSchema, ) +from app.security.jwt import get_current_researcher, get_optional_current_researcher from app.services.normalizer import PublicationNormalizer -from app.services.orcid_client import get_display_name, get_works_summary, get_work_detail -from app.schema.publication import PublicationSchema -from app.db.models import PublicationDownload -from app.security.jwt import get_optional_current_researcher +from app.services.orcid_client import get_display_name, get_work_detail, get_works_summary +from app.utils.orcid_validator import ORCID_PATTERN, is_valid_orcid + router = APIRouter(prefix="/researchers", tags=["researchers"]) -# --------------------------------------------------------- -# Función auxiliar: detectar si una publicación ha cambiado -# --------------------------------------------------------- def publication_changed(existing: Publication, data: dict) -> bool: fields = [ "title", "subtitle", "type", "journal", @@ -33,18 +33,13 @@ def publication_changed(existing: Publication, data: dict) -> bool: "doi", "url", "short_description", "citation_type", "citation_value", "language_code", "country", - "external_ids", "contributors" + "external_ids", "contributors", ] - - for f in fields: - if getattr(existing, f) != data[f]: - return True - return False + return any(getattr(existing, f) != data[f] for f in fields) def build_researcher_stats(publications: list) -> ResearcherStatsSchema: publication_types: dict[str, int] = {} - for publication in publications: pub_type = getattr(publication, "type", None) or "unknown" publication_types[pub_type] = publication_types.get(pub_type, 0) + 1 @@ -98,7 +93,7 @@ def _upsert_researcher_publications( "doi", "url", "short_description", "citation_type", "citation_value", "language_code", "country", - "external_ids", "contributors" + "external_ids", "contributors", ]: setattr(existing, field, data[field]) existing.last_modified = datetime.utcnow() @@ -142,12 +137,17 @@ def _decorate_downloaded_by_me( out: List[PublicationSchema] = [] for p in publications: out.append( - PublicationSchema.model_validate(p).model_copy(update={"downloaded_by_me": p.id in downloaded_ids}) + PublicationSchema.model_validate(p).model_copy( + update={"downloaded_by_me": p.id in downloaded_ids} + ) ) return out def build_search_response(orcid_id: str, db: Session, current: Researcher | None) -> ResearcherWithPublicationsSchema: + if not is_valid_orcid(orcid_id): + raise HTTPException(status_code=400, detail="Invalid ORCID iD") + researcher = db.query(Researcher).filter(Researcher.orcid_id == orcid_id).first() if not researcher: researcher = Researcher( @@ -159,10 +159,6 @@ def build_search_response(orcid_id: str, db: Session, current: Researcher | None db.add(researcher) db.flush() - # Si todavía no conocemos el nombre del investigador (por ejemplo, recién - # creado al sincronizarse desde el buscador), lo resolvemos contra el - # endpoint `/record` público de ORCID. No tocamos un nombre ya existente - # para no pisar valores establecidos por el flujo de autenticación. if not researcher.name: display_name = get_display_name(orcid_id) if display_name: @@ -185,10 +181,27 @@ def build_search_response(orcid_id: str, db: Session, current: Researcher | None # --------------------------------------------------------- -# ENDPOINT 1: SEARCH + SYNC (sin contadores) +# ENDPOINT 1: SEARCH + SYNC # --------------------------------------------------------- -@router.post("/search", response_model=ResearcherBatchSearchResponseSchema, response_model_exclude_none=True) + +def _search_rate_limit(request: Request) -> str: + """ + Aplica un límite distinto si el usuario está autenticado. + Como SlowAPI evalúa el decorador antes de las dependencias, devolvemos + el límite más restrictivo y subimos sólo si hay token (state.researcher). + """ + researcher = getattr(request.state, "researcher", None) + return settings.RATE_LIMIT_SEARCH_AUTH if researcher else settings.RATE_LIMIT_SEARCH_ANON + + +@router.post( + "/search", + response_model=ResearcherBatchSearchResponseSchema, + response_model_exclude_none=True, +) +@limiter.limit(_search_rate_limit) def search_and_sync_researchers( + request: Request, payload: ResearcherBatchSearchRequestSchema, db: Session = Depends(get_db), current: Researcher | None = Depends(get_optional_current_researcher), @@ -196,26 +209,33 @@ def search_and_sync_researchers( results: List[ResearcherWithPublicationsSchema] = [] errors: List[ResearcherSearchErrorSchema] = [] - # Evita llamadas duplicadas a ORCID conservando el orden de entrada. unique_orcid_ids = list(dict.fromkeys(payload.orcid_ids)) for orcid_id in unique_orcid_ids: try: results.append(build_search_response(orcid_id, db, current)) + except HTTPException as exc: + db.rollback() + errors.append( + ResearcherSearchErrorSchema( + orcid_id=orcid_id, + detail=str(exc.detail), + ) + ) except httpx.HTTPStatusError as exc: db.rollback() errors.append( ResearcherSearchErrorSchema( orcid_id=orcid_id, - detail=f"ORCID devolvió {exc.response.status_code} para {orcid_id}.", + detail=f"ORCID returned {exc.response.status_code}", ) ) - except Exception as exc: + except Exception: db.rollback() errors.append( ResearcherSearchErrorSchema( orcid_id=orcid_id, - detail=str(exc), + detail="Unexpected error while processing ORCID iD", ) ) @@ -228,14 +248,24 @@ def search_and_sync_researchers( # --------------------------------------------------------- -# ENDPOINT 2: SYNC COMPLETO (con contadores + status) +# ENDPOINT 2: SYNC COMPLETO (requiere autenticación) # --------------------------------------------------------- -@router.post("/{orcid_id}/sync", response_model=ResearcherWithPublicationsSchema, response_model_exclude_none=True) + +@router.post( + "/{orcid_id}/sync", + response_model=ResearcherWithPublicationsSchema, + response_model_exclude_none=True, +) +@limiter.limit(settings.RATE_LIMIT_SYNC) def sync_researcher( - orcid_id: str, + request: Request, + orcid_id: str = Path(min_length=19, max_length=19, pattern=ORCID_PATTERN), db: Session = Depends(get_db), - current: Researcher | None = Depends(get_optional_current_researcher), + current: Researcher = Depends(get_current_researcher), ): + if not is_valid_orcid(orcid_id): + raise HTTPException(status_code=400, detail="Invalid ORCID iD") + researcher = db.query(Researcher).filter_by(orcid_id=orcid_id).first() if not researcher: raise HTTPException(status_code=404, detail="Researcher not found") @@ -244,7 +274,6 @@ def sync_researcher( groups = works.get("group", []) publications_output = [] - new_count = 0 updated_count = 0 unchanged_count = 0 @@ -277,21 +306,17 @@ def sync_researcher( if existing: if publication_changed(existing, data): - # updated for field in data: setattr(existing, field, data[field]) existing.last_modified = datetime.utcnow() existing.status = "updated" updated_count += 1 else: - # unchanged existing.status = "unchanged" unchanged_count += 1 pub = existing - else: - # new pub = Publication( researcher_id=researcher.id, **data, diff --git a/backend/app/core/__init__.py b/backend/app/core/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/backend/app/core/body_size.py b/backend/app/core/body_size.py new file mode 100644 index 0000000..323e01f --- /dev/null +++ b/backend/app/core/body_size.py @@ -0,0 +1,35 @@ +""" +Middleware que limita el tamaño máximo del cuerpo de la petición. + +Evita ataques de agotamiento de memoria/CPU enviando bodies enormes a +endpoints POST. Se aplica antes de que FastAPI deserialice el JSON. +""" + +from __future__ import annotations + +from starlette.middleware.base import BaseHTTPMiddleware +from starlette.requests import Request +from starlette.responses import JSONResponse, Response + + +class BodySizeLimitMiddleware(BaseHTTPMiddleware): + def __init__(self, app, *, max_bytes: int): + super().__init__(app) + self._max_bytes = max_bytes + + async def dispatch(self, request: Request, call_next) -> Response: + content_length = request.headers.get("content-length") + if content_length is not None: + try: + if int(content_length) > self._max_bytes: + return JSONResponse( + status_code=413, + content={"detail": "Request body too large"}, + ) + except ValueError: + return JSONResponse( + status_code=400, + content={"detail": "Invalid Content-Length header"}, + ) + + return await call_next(request) diff --git a/backend/app/core/config.py b/backend/app/core/config.py new file mode 100644 index 0000000..f69d92e --- /dev/null +++ b/backend/app/core/config.py @@ -0,0 +1,182 @@ +""" +Configuración tipada y validada del backend. + +Centraliza la lectura de variables de entorno, valida secretos críticos al +arranque y evita fallbacks inseguros (p. ej. JWT_SECRET="change_me") en +entornos productivos. +""" + +from __future__ import annotations + +import os +from functools import lru_cache +from pathlib import Path +from typing import List, Literal +from urllib.parse import urlparse + +from dotenv import load_dotenv +from pydantic import Field, field_validator, model_validator +from pydantic_settings import BaseSettings, SettingsConfigDict + + +_ENV_PATH = Path(__file__).resolve().parents[2] / ".env" +load_dotenv(dotenv_path=_ENV_PATH, override=False) + + +def _split_csv(value: str | List[str] | None) -> List[str]: + if value is None: + return [] + if isinstance(value, list): + return [str(v).strip().rstrip("/") for v in value if str(v).strip()] + return [v.strip().rstrip("/") for v in value.split(",") if v.strip()] + + +class Settings(BaseSettings): + """ + Settings inmutables para toda la aplicación. + + En `production` se aplican validaciones más estrictas: + - JWT_SECRET no puede ser un valor débil ni por defecto. + - CORS_ALLOWED_ORIGINS no puede contener "*". + - Se exige ORCID_CLIENT_ID/SECRET y API_KEY_VALUE. + """ + + model_config = SettingsConfigDict( + env_file=str(_ENV_PATH), + env_file_encoding="utf-8", + extra="ignore", + case_sensitive=False, + ) + + ENVIRONMENT: Literal["development", "staging", "production"] = "development" + DEBUG: bool = False + + DATABASE_URL: str = Field(...) + REDIS_URL: str | None = None + BASE_URL: str = "http://localhost:8000/api" + + JWT_SECRET: str = Field(...) + JWT_ALGORITHM: str = "HS256" + JWT_EXPIRES_MINUTES: int = 720 + JWT_ISSUER: str = "orcid-sword-backend" + JWT_AUDIENCE: str = "orcid-sword-frontend" + + API_KEY_NAME: str = "X-API-Key" + API_KEY_VALUE: str = Field(...) + + ORCID_CLIENT_ID: str = Field(...) + ORCID_CLIENT_SECRET: str = Field(...) + ORCID_REDIRECT_URI: str = "http://localhost:8000/api/auth/orcid/callback" + ORCID_OAUTH_STATE_ENABLED: bool = True + ORCID_OAUTH_STATE_COOKIE: str = "orcid_oauth_state" + ORCID_OAUTH_STATE_TTL_SECONDS: int = 600 + + CORS_ALLOWED_ORIGINS: List[str] = Field(default_factory=list) + + TRUSTED_HOSTS: List[str] = Field(default_factory=lambda: ["*"]) + + RATE_LIMIT_DEFAULT: str = "60/minute" + RATE_LIMIT_AUTH: str = "10/minute" + RATE_LIMIT_SEARCH_ANON: str = "5/minute" + RATE_LIMIT_SEARCH_AUTH: str = "30/minute" + RATE_LIMIT_EXPORT: str = "20/minute" + RATE_LIMIT_SYNC: str = "5/minute" + + MAX_ORCID_BATCH: int = 25 + MAX_PUB_IDS_BATCH: int = 500 + MAX_REQUEST_BODY_BYTES: int = 1_048_576 # 1 MiB + + DOCS_ENABLED: bool = True + + SECURITY_HSTS_SECONDS: int = 31_536_000 + SECURITY_HSTS_INCLUDE_SUBDOMAINS: bool = True + SECURITY_HSTS_PRELOAD: bool = False + + @field_validator("CORS_ALLOWED_ORIGINS", mode="before") + @classmethod + def _parse_cors(cls, v): + return _split_csv(v) + + @field_validator("TRUSTED_HOSTS", mode="before") + @classmethod + def _parse_trusted_hosts(cls, v): + parsed = _split_csv(v) if not isinstance(v, list) else v + return parsed or ["*"] + + @model_validator(mode="after") + def _validate_security(self) -> "Settings": + if self.ENVIRONMENT == "production": + weak = {"change_me", "changeme", "secret", "password", ""} + if self.JWT_SECRET.strip().lower() in weak: + raise ValueError( + "JWT_SECRET es débil o está sin configurar. " + "Define un secreto aleatorio fuerte (>= 32 bytes)." + ) + if len(self.JWT_SECRET) < 32: + raise ValueError( + "JWT_SECRET debe tener al menos 32 caracteres en producción." + ) + if "*" in self.CORS_ALLOWED_ORIGINS: + raise ValueError( + "CORS_ALLOWED_ORIGINS no puede contener '*' en producción." + ) + if not self.CORS_ALLOWED_ORIGINS: + raise ValueError( + "CORS_ALLOWED_ORIGINS debe definirse explícitamente en producción." + ) + if not self.API_KEY_VALUE or len(self.API_KEY_VALUE) < 24: + raise ValueError( + "API_KEY_VALUE debe tener al menos 24 caracteres en producción." + ) + if self.TRUSTED_HOSTS == ["*"]: + raise ValueError( + "TRUSTED_HOSTS debe definirse explícitamente en producción." + ) + + for origin in self.CORS_ALLOWED_ORIGINS: + parsed = urlparse(origin) + if parsed.scheme not in {"http", "https"} or not parsed.netloc: + raise ValueError(f"Origen CORS inválido: {origin!r}") + + return self + + @property + def is_production(self) -> bool: + return self.ENVIRONMENT == "production" + + @property + def docs_url(self) -> str | None: + return "/docs" if self.DOCS_ENABLED else None + + @property + def redoc_url(self) -> str | None: + return "/redoc" if self.DOCS_ENABLED else None + + @property + def openapi_url(self) -> str | None: + return "/openapi.json" if self.DOCS_ENABLED else None + + +@lru_cache(maxsize=1) +def get_settings() -> Settings: + """ + Devuelve la instancia única de configuración. + + Se cachea para no releer entorno/archivos en cada request. + """ + return Settings() # type: ignore[call-arg] + + +settings = get_settings() + + +def reload_settings_for_tests() -> Settings: + """ + Helper para tests: invalida la caché y recarga settings. + """ + get_settings.cache_clear() + globals()["settings"] = get_settings() + return globals()["settings"] + + +__all__ = ["Settings", "get_settings", "reload_settings_for_tests", "settings"] diff --git a/backend/app/core/error_handlers.py b/backend/app/core/error_handlers.py new file mode 100644 index 0000000..52803fc --- /dev/null +++ b/backend/app/core/error_handlers.py @@ -0,0 +1,67 @@ +""" +Manejadores de errores que NO filtran información sensible. + +- En producción, las excepciones no controladas devuelven un mensaje genérico. +- En desarrollo, se incluye `type` para depurar (sin trazas). +- Errores de validación se devuelven con 422 estándar de FastAPI. +""" + +from __future__ import annotations + +import logging +import uuid + +from fastapi import HTTPException, Request +from fastapi.exceptions import RequestValidationError +from fastapi.responses import JSONResponse +from sqlalchemy.exc import SQLAlchemyError + +from app.core.config import settings + + +logger = logging.getLogger("app.error") + + +async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse: + return JSONResponse( + status_code=exc.status_code, + content={"detail": exc.detail}, + headers=getattr(exc, "headers", None), + ) + + +async def validation_exception_handler( + request: Request, exc: RequestValidationError +) -> JSONResponse: + safe_errors = [] + for err in exc.errors(): + safe_errors.append( + { + "loc": err.get("loc"), + "msg": err.get("msg"), + "type": err.get("type"), + } + ) + return JSONResponse(status_code=422, content={"detail": safe_errors}) + + +async def sqlalchemy_exception_handler( + request: Request, exc: SQLAlchemyError +) -> JSONResponse: + error_id = str(uuid.uuid4()) + logger.exception("DB error [%s] on %s %s", error_id, request.method, request.url.path) + return JSONResponse( + status_code=500, + content={"detail": "Database error", "error_id": error_id}, + ) + + +async def unhandled_exception_handler(request: Request, exc: Exception) -> JSONResponse: + error_id = str(uuid.uuid4()) + logger.exception( + "Unhandled error [%s] on %s %s", error_id, request.method, request.url.path + ) + payload: dict = {"detail": "Internal server error", "error_id": error_id} + if not settings.is_production and settings.DEBUG: + payload["type"] = exc.__class__.__name__ + return JSONResponse(status_code=500, content=payload) diff --git a/backend/app/core/logging_config.py b/backend/app/core/logging_config.py new file mode 100644 index 0000000..0dd3f7c --- /dev/null +++ b/backend/app/core/logging_config.py @@ -0,0 +1,28 @@ +""" +Configuración de logging estructurada y minimalista. + +- Formatea con timestamp, nivel y logger. +- En producción usa nivel INFO; en desarrollo DEBUG. +- Silencia logs ruidosos de librerías externas para no filtrar headers. +""" + +from __future__ import annotations + +import logging + +from app.core.config import settings + + +_LOG_FORMAT = "%(asctime)s %(levelname)s %(name)s :: %(message)s" + + +def configure_logging() -> None: + level = logging.DEBUG if settings.DEBUG else logging.INFO + + logging.basicConfig(level=level, format=_LOG_FORMAT) + + for noisy in ("httpx", "httpcore", "sqlalchemy.engine.Engine"): + logging.getLogger(noisy).setLevel(logging.WARNING) + + logging.getLogger("uvicorn.error").setLevel(level) + logging.getLogger("uvicorn.access").setLevel(logging.WARNING) diff --git a/backend/app/core/rate_limit.py b/backend/app/core/rate_limit.py new file mode 100644 index 0000000..d216609 --- /dev/null +++ b/backend/app/core/rate_limit.py @@ -0,0 +1,60 @@ +""" +Rate limiting basado en SlowAPI. + +- Usa Redis como backend si `REDIS_URL` está definido (compartido entre workers). +- Cae a memoria local en desarrollo si Redis no está disponible. +- Identifica al cliente por IP y, cuando hay JWT, también por `sub` (orcid_id), + para que un atacante autenticado no comparta cupo con su IP. +""" + +from __future__ import annotations + +from typing import Optional + +from slowapi import Limiter +from slowapi.errors import RateLimitExceeded +from slowapi.util import get_remote_address +from starlette.requests import Request +from starlette.responses import JSONResponse + +from app.core.config import settings + + +def _key_func(request: Request) -> str: + """ + Devuelve la clave de rate limit para el request. + + - Si hay un investigador autenticado en el state, usa su orcid_id. + - En caso contrario, usa la IP remota. + """ + researcher = getattr(request.state, "researcher", None) + if researcher is not None: + return f"user:{getattr(researcher, 'orcid_id', None) or researcher.id}" + return f"ip:{get_remote_address(request)}" + + +def _build_limiter() -> Limiter: + storage_uri: Optional[str] = settings.REDIS_URL + return Limiter( + key_func=_key_func, + default_limits=[settings.RATE_LIMIT_DEFAULT], + storage_uri=storage_uri, + headers_enabled=True, + strategy="fixed-window-elastic-expiry", + ) + + +limiter = _build_limiter() + + +def rate_limit_exceeded_handler(request: Request, exc: RateLimitExceeded) -> JSONResponse: + """ + Respuesta uniforme cuando se supera el límite. + + No revela límites internos exactos para reducir oráculo a atacantes. + """ + return JSONResponse( + status_code=429, + content={"detail": "Too many requests, slow down."}, + headers={"Retry-After": "60"}, + ) diff --git a/backend/app/core/security_headers.py b/backend/app/core/security_headers.py new file mode 100644 index 0000000..9de3eff --- /dev/null +++ b/backend/app/core/security_headers.py @@ -0,0 +1,88 @@ +""" +Middleware de cabeceras de seguridad HTTP. + +Aplica un perfil seguro por defecto: +- Strict-Transport-Security (HSTS) — fuerza HTTPS en navegadores compatibles. +- X-Content-Type-Options: nosniff +- X-Frame-Options: DENY (clickjacking) +- Referrer-Policy: strict-origin-when-cross-origin +- Permissions-Policy: bloquea APIs sensibles por defecto +- Cross-Origin-Opener-Policy / Resource-Policy: aislamiento del navegador +- Content-Security-Policy laxa para Swagger/OpenAPI (CDN), restrictiva para el resto. + +NOTA: El frontend SPA tiene su propia CSP en su servidor. Aquí +endurecemos lo que sirve el backend (JSON, XML, ZIP, /docs, /redoc, etc.). +""" + +from __future__ import annotations + +from starlette.middleware.base import BaseHTTPMiddleware +from starlette.requests import Request +from starlette.responses import Response + +from app.core.config import Settings + + +_DOCS_PATHS = ("/docs", "/redoc", "/openapi.json") + +_BASE_CSP = ( + "default-src 'none'; " + "frame-ancestors 'none'; " + "base-uri 'none'; " + "form-action 'none'" +) + +_SWAGGER_CSP = ( + "default-src 'self'; " + "img-src 'self' data: https://fastapi.tiangolo.com; " + "script-src 'self' https://cdn.jsdelivr.net 'unsafe-inline'; " + "style-src 'self' https://cdn.jsdelivr.net 'unsafe-inline'; " + "font-src 'self' data: https://cdn.jsdelivr.net; " + "connect-src 'self'; " + "frame-ancestors 'none'; " + "base-uri 'self'; " + "form-action 'self'" +) + + +class SecurityHeadersMiddleware(BaseHTTPMiddleware): + """ + Inserta cabeceras de seguridad en cada respuesta. + """ + + def __init__(self, app, settings: Settings): + super().__init__(app) + self._settings = settings + + async def dispatch(self, request: Request, call_next) -> Response: + response: Response = await call_next(request) + + response.headers.setdefault("X-Content-Type-Options", "nosniff") + response.headers.setdefault("X-Frame-Options", "DENY") + response.headers.setdefault("Referrer-Policy", "strict-origin-when-cross-origin") + response.headers.setdefault( + "Permissions-Policy", + "geolocation=(), microphone=(), camera=(), payment=(), usb=(), " + "accelerometer=(), gyroscope=(), magnetometer=(), interest-cohort=()", + ) + response.headers.setdefault("Cross-Origin-Opener-Policy", "same-origin") + response.headers.setdefault("Cross-Origin-Resource-Policy", "same-site") + response.headers.setdefault("X-Permitted-Cross-Domain-Policies", "none") + + if request.url.path in _DOCS_PATHS: + response.headers.setdefault("Content-Security-Policy", _SWAGGER_CSP) + else: + response.headers.setdefault("Content-Security-Policy", _BASE_CSP) + + if request.url.scheme == "https" or self._settings.is_production: + hsts = f"max-age={self._settings.SECURITY_HSTS_SECONDS}" + if self._settings.SECURITY_HSTS_INCLUDE_SUBDOMAINS: + hsts += "; includeSubDomains" + if self._settings.SECURITY_HSTS_PRELOAD: + hsts += "; preload" + response.headers.setdefault("Strict-Transport-Security", hsts) + + response.headers.pop("Server", None) + response.headers.pop("X-Powered-By", None) + + return response diff --git a/backend/app/db/base.py b/backend/app/db/base.py index 59be703..d350806 100644 --- a/backend/app/db/base.py +++ b/backend/app/db/base.py @@ -1,3 +1,7 @@ from sqlalchemy.orm import declarative_base +# --------------------------------------------------------- +# Base de datos +# --------------------------------------------------------- + Base = declarative_base() diff --git a/backend/app/db/models.py b/backend/app/db/models.py index ae61527..e4138b4 100644 --- a/backend/app/db/models.py +++ b/backend/app/db/models.py @@ -6,6 +6,9 @@ from datetime import datetime from app.db.session import Base +# --------------------------------------------------------- +# Modelo de investigador +# --------------------------------------------------------- class Researcher(Base): __tablename__ = "researchers" @@ -18,6 +21,9 @@ class Researcher(Base): publications = relationship("Publication", back_populates="researcher", cascade="all, delete-orphan") +# --------------------------------------------------------- +# Modelo de publicación +# --------------------------------------------------------- class Publication(Base): __tablename__ = "publications" @@ -65,6 +71,9 @@ class Publication(Base): # Legacy: descargado global (deprecado). Mantener por compatibilidad de DB. downloaded = Column(Boolean, nullable=False, default=False) +# --------------------------------------------------------- +# Modelo de descarga de publicación +# --------------------------------------------------------- class PublicationDownload(Base): """ diff --git a/backend/app/db/repositories/publication_repository.py b/backend/app/db/repositories/publication_repository.py index 590010b..ca23694 100644 --- a/backend/app/db/repositories/publication_repository.py +++ b/backend/app/db/repositories/publication_repository.py @@ -1,8 +1,16 @@ from sqlalchemy.orm import Session from app.db.models import Publication +# --------------------------------------------------------- +# Repositorio de publicaciones +# --------------------------------------------------------- + class PublicationRepository: + # --------------------------------------------------------- + # Función auxiliar: obtener publicación por put_code + # --------------------------------------------------------- + @staticmethod def get_by_put_code(db: Session, researcher_id: str, put_code: int): """ @@ -17,6 +25,10 @@ class PublicationRepository: .first() ) + # --------------------------------------------------------- + # Función auxiliar: crear una nueva publicación + # --------------------------------------------------------- + @staticmethod def create(db: Session, researcher_id: str, data: dict): """ @@ -37,6 +49,10 @@ class PublicationRepository: db.refresh(pub) return pub + # --------------------------------------------------------- + # Función auxiliar: actualizar una publicación existente + # --------------------------------------------------------- + @staticmethod def update(db: Session, publication: Publication, data: dict): """ @@ -53,6 +69,10 @@ class PublicationRepository: db.refresh(publication) return publication + # --------------------------------------------------------- + # Función auxiliar: listar publicaciones de un investigador + # --------------------------------------------------------- + @staticmethod def list_by_researcher(db: Session, researcher_id: str): """ diff --git a/backend/app/db/repositories/researcher_repository.py b/backend/app/db/repositories/researcher_repository.py index 4aba7af..1b8c3b2 100644 --- a/backend/app/db/repositories/researcher_repository.py +++ b/backend/app/db/repositories/researcher_repository.py @@ -2,13 +2,24 @@ from sqlalchemy.orm import Session from app.db.models import Researcher from sqlalchemy.sql import func +# --------------------------------------------------------- +# Repositorio de investigadores +# --------------------------------------------------------- class ResearcherRepository: + # --------------------------------------------------------- + # Función auxiliar: obtener investigador por ORCID ID + # --------------------------------------------------------- + @staticmethod def get_by_orcid(db: Session, orcid_id: str): return db.query(Researcher).filter(Researcher.orcid_id == orcid_id).first() + # --------------------------------------------------------- + # Función auxiliar: crear un nuevo investigador + # --------------------------------------------------------- + @staticmethod def create(db: Session, orcid_id: str, name: str = None): researcher = Researcher(orcid_id=orcid_id, name=name) @@ -17,6 +28,10 @@ class ResearcherRepository: db.refresh(researcher) return researcher + # --------------------------------------------------------- + # Función auxiliar: actualizar la última sincronización + # --------------------------------------------------------- + @staticmethod def update_last_sync(db: Session, researcher: Researcher): researcher.last_sync_at = func.now() diff --git a/backend/app/db/repositories/syncjob_repository.py b/backend/app/db/repositories/syncjob_repository.py index 1cb00a1..7860789 100644 --- a/backend/app/db/repositories/syncjob_repository.py +++ b/backend/app/db/repositories/syncjob_repository.py @@ -2,9 +2,16 @@ from sqlalchemy.orm import Session from app.db.models import SyncJob from sqlalchemy.sql import func +# --------------------------------------------------------- +# Repositorio de trabajos de sincronización +# --------------------------------------------------------- class SyncJobRepository: + # --------------------------------------------------------- + # Función auxiliar: iniciar un nuevo trabajo de sincronización + # --------------------------------------------------------- + @staticmethod def start_job(db: Session, researcher_id: str): job = SyncJob( @@ -17,6 +24,10 @@ class SyncJobRepository: db.refresh(job) return job + # --------------------------------------------------------- + # Función auxiliar: finalizar un trabajo de sincronización + # --------------------------------------------------------- + @staticmethod def finish_job(db: Session, job: SyncJob, new_records: int, updated_records: int): job.status = "finished" diff --git a/backend/app/db/session.py b/backend/app/db/session.py index 37b271e..afdd051 100644 --- a/backend/app/db/session.py +++ b/backend/app/db/session.py @@ -9,6 +9,7 @@ load_dotenv() # ----------------------------- # DATABASE URL # ----------------------------- + DATABASE_URL = os.getenv("DATABASE_URL") engine = create_engine( @@ -29,6 +30,7 @@ Base = declarative_base() # ----------------------------- # DB SESSION DEPENDENCY # ----------------------------- + def get_db(): db = SessionLocal() try: @@ -40,17 +42,25 @@ def get_db(): # ----------------------------- # INIT DB (CREA TABLAS) # ----------------------------- + def init_db(): + # Importa modelos para que SQLAlchemy los registre + import app.db.models # noqa # Crea todas las tablas si no existen + Base.metadata.create_all(bind=engine) # Pequeñas migraciones "best-effort" para entornos sin Alembic. # (create_all no altera tablas existentes) + _ensure_columns() +# --------------------------------------------------------- +# Función auxiliar: asegurar columnas existentes +# --------------------------------------------------------- def _ensure_columns(): insp = inspect(engine) diff --git a/backend/app/main.py b/backend/app/main.py index 1e5d6c8..d39e246 100644 --- a/backend/app/main.py +++ b/backend/app/main.py @@ -1,68 +1,154 @@ -from fastapi import Depends, FastAPI +""" +Entry point del backend FastAPI. + +Aplica un perfil de seguridad por defecto: +- Configuración tipada (Pydantic Settings) que falla rápido en producción. +- TrustedHostMiddleware (anti Host-header injection). +- CORS con lista blanca estricta (sin `*`). +- Body size limit (anti DoS por payload). +- Cabeceras de seguridad HTTP. +- Rate limiting (slowapi) con backend Redis si está configurado. +- Error handlers que NO filtran trazas ni internals. +""" + +from __future__ import annotations + +import logging + +from fastapi import Depends, FastAPI, HTTPException, Request +from fastapi.exceptions import RequestValidationError from fastapi.middleware.cors import CORSMiddleware +from fastapi.middleware.trustedhost import TrustedHostMiddleware +from slowapi.errors import RateLimitExceeded +from sqlalchemy.exc import SQLAlchemyError from sqlalchemy.orm import Session -from app.db.session import init_db -from app.db.session import get_db -from app.api.researchers import router as researchers_router +from app.api.auth import complete_oauth_login_response, router as auth_router from app.api.export import router as export_router -from app.api.auth import router as auth_router -from app.api.auth import _complete_oauth_login -from app.schema.auth import OrcidLoginResponseSchema +from app.api.researchers import router as researchers_router +from app.core.body_size import BodySizeLimitMiddleware +from app.core.config import settings +from app.core.error_handlers import ( + http_exception_handler, + sqlalchemy_exception_handler, + unhandled_exception_handler, + validation_exception_handler, +) +from app.core.logging_config import configure_logging +from app.core.rate_limit import limiter, rate_limit_exceeded_handler +from app.core.security_headers import SecurityHeadersMiddleware +from app.db.session import get_db, init_db from app.scheduler.sync_scheduler import start_scheduler +from app.schema.auth import OrcidLoginResponseSchema + + +configure_logging() +logger = logging.getLogger("app.main") -# --------------------------------------------------------- -# Crear instancia principal de FastAPI -# --------------------------------------------------------- app = FastAPI( title="ORCID SWORD Backend", description="Backend para sincronización ORCID y exportación SWORD", - version="1.0.0" + version="1.0.0", + docs_url=settings.docs_url, + redoc_url=settings.redoc_url, + openapi_url=settings.openapi_url, ) # --------------------------------------------------------- -# Crear tablas al iniciar la aplicación +# Middlewares (orden importa: el último añadido es el más externo) # --------------------------------------------------------- + +app.state.limiter = limiter +app.add_exception_handler(RateLimitExceeded, rate_limit_exceeded_handler) + +app.add_middleware(SecurityHeadersMiddleware, settings=settings) + +app.add_middleware( + BodySizeLimitMiddleware, + max_bytes=settings.MAX_REQUEST_BODY_BYTES, +) + +app.add_middleware( + CORSMiddleware, + allow_origins=settings.CORS_ALLOWED_ORIGINS, + allow_credentials=True, + allow_methods=["GET", "POST", "PUT", "PATCH", "DELETE", "OPTIONS"], + allow_headers=[ + "Authorization", + "Content-Type", + "Accept", + "Origin", + "X-Requested-With", + settings.API_KEY_NAME, + ], + expose_headers=["Content-Disposition", "X-RateLimit-Remaining", "X-RateLimit-Reset"], + max_age=600, +) + +app.add_middleware( + TrustedHostMiddleware, + allowed_hosts=settings.TRUSTED_HOSTS, +) + + +# --------------------------------------------------------- +# Exception handlers +# --------------------------------------------------------- + +app.add_exception_handler(HTTPException, http_exception_handler) +app.add_exception_handler(RequestValidationError, validation_exception_handler) +app.add_exception_handler(SQLAlchemyError, sqlalchemy_exception_handler) +app.add_exception_handler(Exception, unhandled_exception_handler) + + +# --------------------------------------------------------- +# Lifecycle +# --------------------------------------------------------- + @app.on_event("startup") -def startup_event(): - init_db() # 🔥 CREA TABLAS - start_scheduler() # 🔥 INICIA SCHEDULER +def on_startup() -> None: + init_db() + start_scheduler() + logger.info( + "Backend ready (env=%s, docs=%s)", + settings.ENVIRONMENT, + bool(settings.DOCS_ENABLED), + ) # --------------------------------------------------------- # Healthcheck # --------------------------------------------------------- + @app.get("/health") -def health(): +def health() -> dict[str, str]: return {"status": "ok"} +# --------------------------------------------------------- +# Alias del callback OAuth (mismo flujo, mismo endurecimiento) +# --------------------------------------------------------- + @app.get("/callback", response_model=OrcidLoginResponseSchema) -def oauth_callback_root(code: str, db: Session = Depends(get_db)): +def oauth_callback_root( + request: Request, + code: str, + state: str | None = None, + db: Session = Depends(get_db), +): """ - Alias para probar redirect URIs como `https://127.0.0.1/callback` en local. - Intercambia el code con ORCID y emite el JWT. + Alias para integraciones que registran un redirect_uri tipo + `https:///callback` en ORCID. """ - return _complete_oauth_login(code=code, db=db) + return complete_oauth_login_response(request=request, code=code, state=state, db=db) # --------------------------------------------------------- -# Registrar routers +# Routers # --------------------------------------------------------- + app.include_router(researchers_router, prefix="/api") app.include_router(export_router, prefix="/api") app.include_router(auth_router, prefix="/api") - - -# --------------------------------------------------------- -# CORS -# --------------------------------------------------------- -app.add_middleware( - CORSMiddleware, - allow_origins=["*"], # en producción limitar - allow_credentials=True, - allow_methods=["*"], - allow_headers=["*"], -) diff --git a/backend/app/scheduler/sync_scheduler.py b/backend/app/scheduler/sync_scheduler.py index 586e054..69ce594 100644 --- a/backend/app/scheduler/sync_scheduler.py +++ b/backend/app/scheduler/sync_scheduler.py @@ -9,9 +9,16 @@ import os # Cargar variables del .env load_dotenv() +# --------------------------------------------------------- +# Variables de entorno +# --------------------------------------------------------- + API_KEY = os.getenv("API_KEY_VALUE") BASE_URL = os.getenv("BASE_URL") +# --------------------------------------------------------- +# Función auxiliar: ejecutar sincronización mensual +# --------------------------------------------------------- def run_monthly_sync(): db = SessionLocal() @@ -36,6 +43,9 @@ def run_monthly_sync(): db.close() +# --------------------------------------------------------- +# Función auxiliar: iniciar el scheduler +# --------------------------------------------------------- def start_scheduler(): scheduler = BackgroundScheduler() diff --git a/backend/app/schema/auth.py b/backend/app/schema/auth.py index 869fde1..bd09626 100644 --- a/backend/app/schema/auth.py +++ b/backend/app/schema/auth.py @@ -1,11 +1,17 @@ from pydantic import BaseModel, Field +# --------------------------------------------------------- +# Modelo de solicitud de login OAuth +# --------------------------------------------------------- class OrcidLoginRequestSchema(BaseModel): # `code` is the authorization code returned by ORCID OAuth after the user signs in. # Exchanging it for tokens must happen server-side. code: str = Field(..., examples=["Q70Y3A"]) +# --------------------------------------------------------- +# Modelo de respuesta de login OAuth +# --------------------------------------------------------- class OrcidLoginResponseSchema(BaseModel): access_token: str diff --git a/backend/app/schema/export.py b/backend/app/schema/export.py new file mode 100644 index 0000000..18ef6f7 --- /dev/null +++ b/backend/app/schema/export.py @@ -0,0 +1,23 @@ +""" +Schemas de los endpoints de export. + +El backend recibe `pub_ids` como UUIDs en formato string. Pydantic ya los +valida y convierte; aquí además aplicamos un tope de tamaño para impedir +peticiones gigantes. +""" + +from __future__ import annotations + +from typing import List +from uuid import UUID + +from pydantic import BaseModel, Field + +from app.core.config import settings + + +class PublicationIdsRequestSchema(BaseModel): + pub_ids: List[UUID] = Field( + min_length=1, + max_length=settings.MAX_PUB_IDS_BATCH, + ) diff --git a/backend/app/schema/publication.py b/backend/app/schema/publication.py index a36c813..45bb473 100644 --- a/backend/app/schema/publication.py +++ b/backend/app/schema/publication.py @@ -3,6 +3,10 @@ from uuid import UUID from typing import Optional, List, Any from datetime import datetime +# --------------------------------------------------------- +# Modelo de publicación +# --------------------------------------------------------- + class PublicationSchema(BaseModel): id: UUID put_code: int | None = None diff --git a/backend/app/schema/researcher.py b/backend/app/schema/researcher.py index 2be69a4..8753bc6 100644 --- a/backend/app/schema/researcher.py +++ b/backend/app/schema/researcher.py @@ -1,13 +1,18 @@ -from pydantic import BaseModel, Field -from uuid import UUID -from typing import Optional, List, Dict from datetime import datetime +from typing import Dict, List, Optional +from uuid import UUID + +from pydantic import BaseModel, Field, field_validator + +from app.core.config import settings from app.schema.publication import PublicationSchema +from app.utils.orcid_validator import ORCID_PATTERN, is_valid_orcid + class ResearcherSchema(BaseModel): id: UUID - orcid_id: str - name: Optional[str] + orcid_id: str = Field(min_length=19, max_length=19, pattern=ORCID_PATTERN) + name: Optional[str] = Field(default=None, max_length=255) authenticated: bool last_sync_at: Optional[datetime] @@ -33,7 +38,25 @@ class ResearcherWithPublicationsSchema(BaseModel): class ResearcherBatchSearchRequestSchema(BaseModel): - orcid_ids: List[str] = Field(min_length=1) + orcid_ids: List[str] = Field( + min_length=1, + max_length=settings.MAX_ORCID_BATCH, + ) + + @field_validator("orcid_ids") + @classmethod + def _validate_each(cls, value: List[str]) -> List[str]: + deduped: List[str] = [] + seen = set() + for v in value: + if not isinstance(v, str): + raise ValueError("ORCID iD debe ser string") + if not is_valid_orcid(v): + raise ValueError(f"ORCID iD inválido: {v}") + if v not in seen: + seen.add(v) + deduped.append(v) + return deduped class ResearcherSearchErrorSchema(BaseModel): diff --git a/backend/app/security/api_key.py b/backend/app/security/api_key.py index 7dc9197..c4b4336 100644 --- a/backend/app/security/api_key.py +++ b/backend/app/security/api_key.py @@ -1,43 +1,52 @@ -import os -from dotenv import load_dotenv +""" +Autenticación por API key (uso máquina-a-máquina, p. ej. el scheduler interno). + +Endurecimiento: +- Comparación constante en tiempo (`hmac.compare_digest`) para evitar timing attacks. +- No se loggea el valor de la cabecera bajo ninguna circunstancia. +- Se separa este mecanismo del JWT de usuario; la API key NO debe usarse como + prueba de identidad de un investigador. +""" + +from __future__ import annotations + +import hmac + from fastapi import Depends, HTTPException, status from fastapi.security import APIKeyHeader -# Cargar variables del .env -load_dotenv() - -API_KEY_NAME = os.getenv("API_KEY_NAME") -API_KEY_VALUE = os.getenv("API_KEY_VALUE") - -if not API_KEY_NAME: - raise RuntimeError("ERROR: La variable API_KEY_NAME no está definida en el .env") - -if not API_KEY_VALUE: - raise RuntimeError("ERROR: La variable API_KEY_VALUE no está definida en el .env") - -api_key_header = APIKeyHeader(name=API_KEY_NAME, auto_error=False) +from app.core.config import settings -def get_api_key(api_key: str = Depends(api_key_header)): - if api_key != API_KEY_VALUE: +api_key_header = APIKeyHeader(name=settings.API_KEY_NAME, auto_error=False) + + +def _is_valid_key(provided: str | None) -> bool: + if not provided or not settings.API_KEY_VALUE: + return False + return hmac.compare_digest(provided.encode("utf-8"), settings.API_KEY_VALUE.encode("utf-8")) + + +def get_api_key(api_key: str | None = Depends(api_key_header)) -> str: + if not _is_valid_key(api_key): raise HTTPException( status_code=status.HTTP_401_UNAUTHORIZED, - detail="API key inválida o ausente." + detail="Invalid or missing API key", ) - return api_key + return api_key # type: ignore[return-value] -def get_api_key_optional(api_key: str = Depends(api_key_header)) -> str | None: +def get_api_key_optional(api_key: str | None = Depends(api_key_header)) -> str | None: """ - Devuelve la API key si está presente y es correcta. - - Si no está presente: None - - Si está presente pero incorrecta: 401 + - Si no llega cabecera: None. + - Si llega y es válida: la devuelve. + - Si llega pero es inválida: 401. """ if api_key is None: return None - if api_key != API_KEY_VALUE: + if not _is_valid_key(api_key): raise HTTPException( status_code=status.HTTP_401_UNAUTHORIZED, - detail="API key inválida." + detail="Invalid API key", ) return api_key diff --git a/backend/app/security/jwt.py b/backend/app/security/jwt.py index e8a930c..7edab3d 100644 --- a/backend/app/security/jwt.py +++ b/backend/app/security/jwt.py @@ -1,75 +1,138 @@ -import os +""" +Emisión y verificación de JWT. + +Endurecimiento aplicado: +- Sin fallback de secreto débil: si la configuración no es válida, falla al arranque. +- `iss` y `aud` obligatorios. +- `nbf` (not-before) y `iat` validados. +- `typ=access` para evitar mezclar tipos de token. +- Algoritmo fijo (no se acepta "none" ni cambios por payload). +- Errores opacos: nunca se expone el motivo del fallo de verificación al cliente. +""" + +from __future__ import annotations + from datetime import datetime, timedelta, timezone from typing import Any +from uuid import uuid4 -from fastapi import Depends, HTTPException, status +from fastapi import Depends, HTTPException, Request, status from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer from jose import JWTError, jwt from sqlalchemy.orm import Session -from dotenv import load_dotenv +from app.core.config import settings from app.db.models import Researcher from app.db.session import get_db - -load_dotenv() +from app.utils.orcid_validator import is_valid_orcid _bearer = HTTPBearer(auto_error=False) -def _settings() -> tuple[str, str, int]: - # Fallback de desarrollo para evitar 500 por configuración ausente. - secret = os.getenv("JWT_SECRET") or "change_me" - algorithm = os.getenv("JWT_ALGORITHM") or "HS256" - expires_minutes = int(os.getenv("JWT_EXPIRES_MINUTES") or "720") - return secret, algorithm, expires_minutes - - def create_access_token(*, subject: str, extra: dict[str, Any] | None = None) -> str: - secret, algorithm, expires_minutes = _settings() + """ + Emite un access token firmado con HS256 (configurable). + + `subject` debe ser el ORCID iD verificado del investigador. + """ + if not is_valid_orcid(subject): + raise ValueError("subject must be a valid ORCID iD") + now = datetime.now(timezone.utc) payload: dict[str, Any] = { + "iss": settings.JWT_ISSUER, + "aud": settings.JWT_AUDIENCE, "sub": subject, "iat": int(now.timestamp()), - "exp": int((now + timedelta(minutes=expires_minutes)).timestamp()), + "nbf": int(now.timestamp()), + "exp": int((now + timedelta(minutes=settings.JWT_EXPIRES_MINUTES)).timestamp()), + "jti": uuid4().hex, + "typ": "access", } if extra: + for reserved in ("iss", "aud", "sub", "iat", "nbf", "exp", "jti", "typ"): + extra.pop(reserved, None) payload.update(extra) - return jwt.encode(payload, secret, algorithm=algorithm) + + return jwt.encode(payload, settings.JWT_SECRET, algorithm=settings.JWT_ALGORITHM) + + +def _decode_token(token: str) -> dict[str, Any]: + try: + return jwt.decode( + token, + settings.JWT_SECRET, + algorithms=[settings.JWT_ALGORITHM], + audience=settings.JWT_AUDIENCE, + issuer=settings.JWT_ISSUER, + options={ + "require_iat": True, + "require_nbf": True, + "require_exp": True, + "require_aud": True, + "require_iss": True, + }, + ) + except JWTError as exc: + raise HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail="Invalid or expired token", + headers={"WWW-Authenticate": "Bearer"}, + ) from exc def get_current_researcher( - creds: HTTPAuthorizationCredentials = Depends(_bearer), + request: Request, + creds: HTTPAuthorizationCredentials | None = Depends(_bearer), db: Session = Depends(get_db), ) -> Researcher: if not creds or not creds.credentials: - raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="Missing bearer token") + raise HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail="Missing bearer token", + headers={"WWW-Authenticate": "Bearer"}, + ) - secret, algorithm, _ = _settings() - try: - payload = jwt.decode(creds.credentials, secret, algorithms=[algorithm]) - except JWTError: - raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="Invalid token") + payload = _decode_token(creds.credentials) + + if payload.get("typ") != "access": + raise HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail="Invalid token type", + headers={"WWW-Authenticate": "Bearer"}, + ) orcid_id = payload.get("sub") - if not isinstance(orcid_id, str) or not orcid_id: - raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="Invalid token subject") + if not isinstance(orcid_id, str) or not is_valid_orcid(orcid_id): + raise HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail="Invalid token subject", + headers={"WWW-Authenticate": "Bearer"}, + ) researcher = db.query(Researcher).filter(Researcher.orcid_id == orcid_id).first() if not researcher or not researcher.authenticated: - raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="Researcher not authenticated") + raise HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail="Researcher not authenticated", + headers={"WWW-Authenticate": "Bearer"}, + ) + + request.state.researcher = researcher return researcher def get_optional_current_researcher( - creds: HTTPAuthorizationCredentials = Depends(_bearer), + request: Request, + creds: HTTPAuthorizationCredentials | None = Depends(_bearer), db: Session = Depends(get_db), ) -> Researcher | None: """ - Devuelve el investigador autenticado si hay Bearer token. - Si no hay token, devuelve None. - Si hay token inválido, lanza 401. + Devuelve el investigador autenticado si hay Bearer válido. + Si no hay Bearer, devuelve None. + Si hay Bearer inválido, lanza 401 (no se acepta como anónimo). """ if not creds or not creds.credentials: return None - return get_current_researcher(creds=creds, db=db) + return get_current_researcher(request=request, creds=creds, db=db) diff --git a/backend/app/security/oauth_state.py b/backend/app/security/oauth_state.py new file mode 100644 index 0000000..92475b8 --- /dev/null +++ b/backend/app/security/oauth_state.py @@ -0,0 +1,76 @@ +""" +OAuth state anti-CSRF para el flujo de login con ORCID. + +El parámetro `state` se genera en `/auth/orcid/authorize`, se guarda en una +cookie HttpOnly + SameSite=Lax con TTL corto, y se valida en el callback. + +Si el `state` falta, no coincide o ha expirado, el login se rechaza. +""" + +from __future__ import annotations + +import hmac +import secrets +from datetime import datetime, timezone + +from fastapi import HTTPException, status +from starlette.requests import Request +from starlette.responses import Response + +from app.core.config import settings + + +_STATE_BYTES = 32 + + +def generate_state() -> str: + return secrets.token_urlsafe(_STATE_BYTES) + + +def attach_state_cookie(response: Response, state: str) -> None: + """ + Persiste el `state` en una cookie segura y devuelve el valor crudo. + """ + response.set_cookie( + key=settings.ORCID_OAUTH_STATE_COOKIE, + value=state, + max_age=settings.ORCID_OAUTH_STATE_TTL_SECONDS, + secure=settings.is_production, + httponly=True, + samesite="lax", + path="/", + ) + + +def clear_state_cookie(response: Response) -> None: + response.delete_cookie( + key=settings.ORCID_OAUTH_STATE_COOKIE, + path="/", + ) + + +def validate_state(request: Request, received_state: str | None) -> None: + """ + Compara el state recibido en el callback con el almacenado en cookie. + + Lanza 400 si no coincide o falta. Comparación en tiempo constante. + """ + if not settings.ORCID_OAUTH_STATE_ENABLED: + return + + cookie_value = request.cookies.get(settings.ORCID_OAUTH_STATE_COOKIE) + if not cookie_value or not received_state: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail="OAuth state missing", + ) + + if not hmac.compare_digest(cookie_value.encode("utf-8"), received_state.encode("utf-8")): + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail="OAuth state mismatch", + ) + + +def now_ts() -> int: + return int(datetime.now(timezone.utc).timestamp()) diff --git a/backend/app/services/normalizer.py b/backend/app/services/normalizer.py index fbe41bb..c9e9688 100644 --- a/backend/app/services/normalizer.py +++ b/backend/app/services/normalizer.py @@ -1,5 +1,8 @@ from typing import List +# --------------------------------------------------------- +# Función auxiliar: obtener valor de un diccionario +# --------------------------------------------------------- def _get(d: dict | None, *keys, default=None): cur = d or {} @@ -11,6 +14,9 @@ def _get(d: dict | None, *keys, default=None): return default return cur +# --------------------------------------------------------- +# Clase de normalización de publicaciones +# --------------------------------------------------------- class PublicationNormalizer: @staticmethod diff --git a/backend/app/services/orcid_client.py b/backend/app/services/orcid_client.py index f0535dd..c143aeb 100644 --- a/backend/app/services/orcid_client.py +++ b/backend/app/services/orcid_client.py @@ -14,8 +14,14 @@ BASE_URL_SANDBOX = "https://pub.sandbox.orcid.org/v3.0" # TOKEN_URL_PROD = "https://orcid.org/oauth/token" # BASE_URL_PROD = "https://pub.orcid.org/v3.0" +# --------------------------------------------------------- +# Clase de cliente de ORCID +# --------------------------------------------------------- class ORCIDClient: + # --------------------------------------------------------- + # Función auxiliar: inicializar el cliente de ORCID + # --------------------------------------------------------- def __init__(self): # Asegura que al ejecutar `uvicorn` local también se carga `backend/.env`. # (En docker `ORCID_REDIRECT_URI` y secretos llegan por env_file, así que esto no molesta.) @@ -115,6 +121,10 @@ class ORCIDClient: params["state"] = state return f"{self.authorization_url}?{urllib.parse.urlencode(params)}" + # --------------------------------------------------------- + # Función auxiliar: intercambiar código de autorización + # --------------------------------------------------------- + def exchange_authorization_code( self, *, diff --git a/backend/app/services/sword_generator.py b/backend/app/services/sword_generator.py index a6a0f58..b1ce806 100644 --- a/backend/app/services/sword_generator.py +++ b/backend/app/services/sword_generator.py @@ -6,6 +6,9 @@ ATOM_NS = "http://www.w3.org/2005/Atom" DC_NS = "http://purl.org/dc/elements/1.1/" EXTRA_NS = "http://example.org/orcid-extra" # namespace para campos extendidos +# --------------------------------------------------------- +# Clase de generador de feed SWORD +# --------------------------------------------------------- class SWORDGenerator: diff --git a/backend/app/services/sync_service.py b/backend/app/services/sync_service.py index baf4b23..911d048 100644 --- a/backend/app/services/sync_service.py +++ b/backend/app/services/sync_service.py @@ -8,12 +8,23 @@ from app.db.repositories.researcher_repository import ResearcherRepository from app.db.repositories.publication_repository import PublicationRepository from app.db.repositories.syncjob_repository import SyncJobRepository +# --------------------------------------------------------- +# Clase de servicio de sincronización +# --------------------------------------------------------- class SyncService: + # --------------------------------------------------------- + # Función auxiliar: inicializar el servicio de sincronización + # --------------------------------------------------------- + def __init__(self): self.orcid_client = ORCIDClient() + # --------------------------------------------------------- + # Función auxiliar: sincronizar las publicaciones de un investigador + # --------------------------------------------------------- + def sync_researcher(self, db: Session, orcid_id: str): """ Sincroniza las publicaciones de un investigador con manejo robusto de errores. @@ -109,6 +120,10 @@ class SyncService: "total": new_records + updated_records } + # --------------------------------------------------------- + # Función auxiliar: sincronizar y obtener investigador + publicaciones + # --------------------------------------------------------- + def sync_and_get_full(self, db: Session, orcid_id: str): """ Sincroniza (si es necesario) y devuelve investigador + publicaciones. diff --git a/backend/app/services/zip_generator.py b/backend/app/services/zip_generator.py index f37e8fc..e0ed31b 100644 --- a/backend/app/services/zip_generator.py +++ b/backend/app/services/zip_generator.py @@ -7,12 +7,16 @@ from xml.etree.ElementTree import Element, SubElement, tostring from app.db.models import Publication, Researcher from app.services.sword_generator import SWORDGenerator +# --------------------------------------------------------- +# Clase de generador de ZIP +# --------------------------------------------------------- class ZIPGenerator: # --------------------------------------------------------- - # MANIFEST.TXT — más completo + # Función auxiliar: generar manifest.txt # --------------------------------------------------------- + @staticmethod def generate_manifest(researcher, publications): lines = [ diff --git a/backend/app/utils/orcid_validator.py b/backend/app/utils/orcid_validator.py index 235a88b..7eb9f4d 100644 --- a/backend/app/utils/orcid_validator.py +++ b/backend/app/utils/orcid_validator.py @@ -2,27 +2,38 @@ import re ORCID_REGEX = re.compile(r"^\d{4}-\d{4}-\d{4}-\d{3}[0-9X]$") +ORCID_PATTERN = r"^\d{4}-\d{4}-\d{4}-\d{3}[0-9X]$" -def is_valid_orcid(orcid_id: str) -> bool: + +def is_valid_orcid(orcid_id: str | None) -> bool: """ Valida un ORCID ID: - Formato: 0000-0000-0000-0000 - Dígito de control según ISO 7064 Mod 11-2 """ + if not isinstance(orcid_id, str): + return False if not ORCID_REGEX.match(orcid_id): return False - # Quitar guiones digits = orcid_id.replace("-", "") total = 0 - # Los primeros 15 dígitos for char in digits[:-1]: total = (total + int(char)) * 2 - # Resto remainder = total % 11 result = (12 - remainder) % 11 check_digit = "X" if result == 10 else str(result) return digits[-1] == check_digit + + +def assert_valid_orcid(orcid_id: str) -> str: + """ + Devuelve el ORCID si es válido. Lanza ValueError si no. + Útil para usar como Pydantic validator. + """ + if not is_valid_orcid(orcid_id): + raise ValueError("ORCID iD inválido") + return orcid_id diff --git a/backend/requirements.txt b/backend/requirements.txt index 39dcb09..9c4863f 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -1,14 +1,16 @@ fastapi -uvicorn +uvicorn[standard] sqlalchemy psycopg2-binary httpx pydantic +pydantic-settings python-dotenv lxml -apscheduler +defusedxml +APScheduler==3.10.4 authlib redis -APScheduler==3.10.4 requests python-jose[cryptography] +slowapi diff --git a/docker-compose.yml b/docker-compose.yml index da14276..382f464 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -3,42 +3,56 @@ services: backend: build: ./backend container_name: orcid-backend - restart: always + restart: unless-stopped ports: - - "8000:8000" + - "127.0.0.1:8000:8000" env_file: - ./backend/.env environment: DATABASE_URL: postgresql://postgres:postgres@db:5432/orcid_db REDIS_URL: redis://redis:6379/0 - ORCID_REDIRECT_URI: https://jargon-supreme-palpable.ngrok-free.dev/callback depends_on: db: condition: service_healthy redis: condition: service_started + read_only: true + tmpfs: + - /tmp + cap_drop: + - ALL + security_opt: + - no-new-privileges:true + healthcheck: + test: ["CMD", "curl", "-fsS", "http://127.0.0.1:8000/health"] + interval: 30s + timeout: 5s + retries: 3 + start_period: 15s frontend: build: ./frontend container_name: orcid-frontend - restart: always + restart: unless-stopped ports: - - "5173:5173" + - "127.0.0.1:5173:5173" depends_on: - backend env_file: - ./frontend/.env + security_opt: + - no-new-privileges:true db: image: postgres:16 container_name: orcid-postgres - restart: always + restart: unless-stopped environment: POSTGRES_USER: postgres POSTGRES_PASSWORD: postgres POSTGRES_DB: orcid_db - ports: - - "5432:5432" + expose: + - "5432" volumes: - postgres_data:/var/lib/postgresql/data healthcheck: @@ -46,13 +60,18 @@ services: interval: 2s timeout: 3s retries: 20 + security_opt: + - no-new-privileges:true redis: image: redis:7 container_name: orcid-redis - restart: always - ports: - - "6379:6379" + restart: unless-stopped + command: ["redis-server", "--save", "60", "1", "--loglevel", "warning"] + expose: + - "6379" + security_opt: + - no-new-privileges:true volumes: postgres_data: From 1dd1096744cb4a53d3d294b9ad1e7a8407fd8a9b Mon Sep 17 00:00:00 2001 From: Mireya Cueto Garrido Date: Fri, 8 May 2026 12:13:05 +0200 Subject: [PATCH 2/2] feat: enhance error handling and configuration in backend - Added ORCID_REDIRECT_URI to docker-compose for OAuth callback. - Refactored CORS and trusted hosts settings in configuration for better clarity. - Introduced a new function to validate publication IDs and provide explicit error messages for researcher IDs. - Updated rate limiting strategy to simplify configuration. - Improved security headers middleware to safely remove sensitive headers. --- backend/app/api/export.py | 23 ++++++++++++++++++ backend/app/api/researchers.py | 15 +++--------- backend/app/core/config.py | 35 ++++++++++++++-------------- backend/app/core/rate_limit.py | 4 ++-- backend/app/core/security_headers.py | 23 ++++-------------- backend/app/main.py | 4 ++-- docker-compose.yml | 1 + 7 files changed, 54 insertions(+), 51 deletions(-) diff --git a/backend/app/api/export.py b/backend/app/api/export.py index 7e20fd5..c3a9a6a 100644 --- a/backend/app/api/export.py +++ b/backend/app/api/export.py @@ -63,6 +63,27 @@ def _validate_pub_ids(pub_ids: List[UUID]) -> List[UUID]: return pub_ids +def _raise_clear_error_if_researcher_id_was_used(db: Session, pub_ids: List[UUID]) -> None: + """ + Si el cliente envía por error el UUID de un investigador al endpoint + de publicaciones, devolvemos un mensaje explícito para guiar el uso. + """ + if len(pub_ids) != 1: + return + + researcher = db.query(Researcher).filter(Researcher.id == pub_ids[0]).first() + if researcher: + raise HTTPException( + status_code=400, + detail=( + "The provided UUID belongs to a researcher, not a publication. " + "Use publication IDs for this endpoint, or call " + f"/api/export/sword/researcher/{researcher.orcid_id} " + f"(or /api/export/zip/researcher/{researcher.orcid_id})." + ), + ) + + # --------------------------------------------------------- # ENDPOINT 1: SWORD múltiples publicaciones # --------------------------------------------------------- @@ -81,6 +102,7 @@ async def export_multiple_sword( pubs = db.query(Publication).filter(Publication.id.in_(pub_ids)).all() if not pubs: + _raise_clear_error_if_researcher_id_was_used(db, pub_ids) raise HTTPException(status_code=404, detail="No publications found") researcher = db.query(Researcher).filter_by(id=pubs[0].researcher_id).first() @@ -142,6 +164,7 @@ async def export_multiple_zip( pubs = db.query(Publication).filter(Publication.id.in_(pub_ids)).all() if not pubs: + _raise_clear_error_if_researcher_id_was_used(db, pub_ids) raise HTTPException(status_code=404, detail="No publications found") researcher = db.query(Researcher).filter_by(id=pubs[0].researcher_id).first() diff --git a/backend/app/api/researchers.py b/backend/app/api/researchers.py index c8df04d..82859df 100644 --- a/backend/app/api/researchers.py +++ b/backend/app/api/researchers.py @@ -17,7 +17,7 @@ from app.schema.researcher import ( ResearcherStatsSchema, ResearcherWithPublicationsSchema, ) -from app.security.jwt import get_current_researcher, get_optional_current_researcher +from app.security.jwt import get_optional_current_researcher from app.services.normalizer import PublicationNormalizer from app.services.orcid_client import get_display_name, get_work_detail, get_works_summary from app.utils.orcid_validator import ORCID_PATTERN, is_valid_orcid @@ -184,22 +184,13 @@ def build_search_response(orcid_id: str, db: Session, current: Researcher | None # ENDPOINT 1: SEARCH + SYNC # --------------------------------------------------------- -def _search_rate_limit(request: Request) -> str: - """ - Aplica un límite distinto si el usuario está autenticado. - Como SlowAPI evalúa el decorador antes de las dependencias, devolvemos - el límite más restrictivo y subimos sólo si hay token (state.researcher). - """ - researcher = getattr(request.state, "researcher", None) - return settings.RATE_LIMIT_SEARCH_AUTH if researcher else settings.RATE_LIMIT_SEARCH_ANON - @router.post( "/search", response_model=ResearcherBatchSearchResponseSchema, response_model_exclude_none=True, ) -@limiter.limit(_search_rate_limit) +@limiter.limit(settings.RATE_LIMIT_SEARCH_ANON) def search_and_sync_researchers( request: Request, payload: ResearcherBatchSearchRequestSchema, @@ -261,7 +252,7 @@ def sync_researcher( request: Request, orcid_id: str = Path(min_length=19, max_length=19, pattern=ORCID_PATTERN), db: Session = Depends(get_db), - current: Researcher = Depends(get_current_researcher), + current: Researcher | None = Depends(get_optional_current_researcher), ): if not is_valid_orcid(orcid_id): raise HTTPException(status_code=400, detail="Invalid ORCID iD") diff --git a/backend/app/core/config.py b/backend/app/core/config.py index f69d92e..b77834f 100644 --- a/backend/app/core/config.py +++ b/backend/app/core/config.py @@ -71,9 +71,9 @@ class Settings(BaseSettings): ORCID_OAUTH_STATE_COOKIE: str = "orcid_oauth_state" ORCID_OAUTH_STATE_TTL_SECONDS: int = 600 - CORS_ALLOWED_ORIGINS: List[str] = Field(default_factory=list) + CORS_ALLOWED_ORIGINS: str = "" - TRUSTED_HOSTS: List[str] = Field(default_factory=lambda: ["*"]) + TRUSTED_HOSTS: str = "*" RATE_LIMIT_DEFAULT: str = "60/minute" RATE_LIMIT_AUTH: str = "10/minute" @@ -92,19 +92,11 @@ class Settings(BaseSettings): SECURITY_HSTS_INCLUDE_SUBDOMAINS: bool = True SECURITY_HSTS_PRELOAD: bool = False - @field_validator("CORS_ALLOWED_ORIGINS", mode="before") - @classmethod - def _parse_cors(cls, v): - return _split_csv(v) - - @field_validator("TRUSTED_HOSTS", mode="before") - @classmethod - def _parse_trusted_hosts(cls, v): - parsed = _split_csv(v) if not isinstance(v, list) else v - return parsed or ["*"] - @model_validator(mode="after") def _validate_security(self) -> "Settings": + cors_origins = self.cors_allowed_origins + trusted_hosts = self.trusted_hosts + if self.ENVIRONMENT == "production": weak = {"change_me", "changeme", "secret", "password", ""} if self.JWT_SECRET.strip().lower() in weak: @@ -116,11 +108,11 @@ class Settings(BaseSettings): raise ValueError( "JWT_SECRET debe tener al menos 32 caracteres en producción." ) - if "*" in self.CORS_ALLOWED_ORIGINS: + if "*" in cors_origins: raise ValueError( "CORS_ALLOWED_ORIGINS no puede contener '*' en producción." ) - if not self.CORS_ALLOWED_ORIGINS: + if not cors_origins: raise ValueError( "CORS_ALLOWED_ORIGINS debe definirse explícitamente en producción." ) @@ -128,12 +120,12 @@ class Settings(BaseSettings): raise ValueError( "API_KEY_VALUE debe tener al menos 24 caracteres en producción." ) - if self.TRUSTED_HOSTS == ["*"]: + if trusted_hosts == ["*"]: raise ValueError( "TRUSTED_HOSTS debe definirse explícitamente en producción." ) - for origin in self.CORS_ALLOWED_ORIGINS: + for origin in cors_origins: parsed = urlparse(origin) if parsed.scheme not in {"http", "https"} or not parsed.netloc: raise ValueError(f"Origen CORS inválido: {origin!r}") @@ -144,6 +136,15 @@ class Settings(BaseSettings): def is_production(self) -> bool: return self.ENVIRONMENT == "production" + @property + def cors_allowed_origins(self) -> List[str]: + return _split_csv(self.CORS_ALLOWED_ORIGINS) + + @property + def trusted_hosts(self) -> List[str]: + parsed = _split_csv(self.TRUSTED_HOSTS) + return parsed or ["*"] + @property def docs_url(self) -> str | None: return "/docs" if self.DOCS_ENABLED else None diff --git a/backend/app/core/rate_limit.py b/backend/app/core/rate_limit.py index d216609..92b2e82 100644 --- a/backend/app/core/rate_limit.py +++ b/backend/app/core/rate_limit.py @@ -39,8 +39,8 @@ def _build_limiter() -> Limiter: key_func=_key_func, default_limits=[settings.RATE_LIMIT_DEFAULT], storage_uri=storage_uri, - headers_enabled=True, - strategy="fixed-window-elastic-expiry", + headers_enabled=False, + strategy="fixed-window", ) diff --git a/backend/app/core/security_headers.py b/backend/app/core/security_headers.py index 9de3eff..18742c9 100644 --- a/backend/app/core/security_headers.py +++ b/backend/app/core/security_headers.py @@ -1,19 +1,3 @@ -""" -Middleware de cabeceras de seguridad HTTP. - -Aplica un perfil seguro por defecto: -- Strict-Transport-Security (HSTS) — fuerza HTTPS en navegadores compatibles. -- X-Content-Type-Options: nosniff -- X-Frame-Options: DENY (clickjacking) -- Referrer-Policy: strict-origin-when-cross-origin -- Permissions-Policy: bloquea APIs sensibles por defecto -- Cross-Origin-Opener-Policy / Resource-Policy: aislamiento del navegador -- Content-Security-Policy laxa para Swagger/OpenAPI (CDN), restrictiva para el resto. - -NOTA: El frontend SPA tiene su propia CSP en su servidor. Aquí -endurecemos lo que sirve el backend (JSON, XML, ZIP, /docs, /redoc, etc.). -""" - from __future__ import annotations from starlette.middleware.base import BaseHTTPMiddleware @@ -82,7 +66,10 @@ class SecurityHeadersMiddleware(BaseHTTPMiddleware): hsts += "; preload" response.headers.setdefault("Strict-Transport-Security", hsts) - response.headers.pop("Server", None) - response.headers.pop("X-Powered-By", None) + # `MutableHeaders` no implementa `.pop()`. Eliminamos de forma segura. + if "server" in response.headers: + del response.headers["server"] + if "x-powered-by" in response.headers: + del response.headers["x-powered-by"] return response diff --git a/backend/app/main.py b/backend/app/main.py index d39e246..fe98b86 100644 --- a/backend/app/main.py +++ b/backend/app/main.py @@ -72,7 +72,7 @@ app.add_middleware( app.add_middleware( CORSMiddleware, - allow_origins=settings.CORS_ALLOWED_ORIGINS, + allow_origins=settings.cors_allowed_origins, allow_credentials=True, allow_methods=["GET", "POST", "PUT", "PATCH", "DELETE", "OPTIONS"], allow_headers=[ @@ -89,7 +89,7 @@ app.add_middleware( app.add_middleware( TrustedHostMiddleware, - allowed_hosts=settings.TRUSTED_HOSTS, + allowed_hosts=settings.trusted_hosts, ) diff --git a/docker-compose.yml b/docker-compose.yml index 382f464..a0ea598 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -11,6 +11,7 @@ services: environment: DATABASE_URL: postgresql://postgres:postgres@db:5432/orcid_db REDIS_URL: redis://redis:6379/0 + ORCID_REDIRECT_URI: https://jargon-supreme-palpable.ngrok-free.dev/callback depends_on: db: condition: service_healthy