feat: enhance backend security and configuration

- Updated Dockerfile to improve security with a non-root user and added health checks.
- Modified docker-compose.yml to set containers as read-only, restrict ports to localhost, and implement health checks.
- Enhanced .env.example with additional environment variables for security and configuration.
- Improved FastAPI application with middleware for security headers, CORS, and body size limits.
- Refactored authentication flow in auth.py to include state validation and improved error handling.
- Added rate limiting to various endpoints to prevent abuse.
- Updated researcher and publication handling to ensure better validation and error management.
This commit is contained in:
Mireya Cueto Garrido
2026-05-08 11:19:52 +02:00
parent 96e58dbd16
commit af1b8e9956
37 changed files with 1375 additions and 282 deletions
+74 -38
View File
@@ -1,64 +1,68 @@
import logging
import httpx
import os
from pathlib import Path
from dotenv import load_dotenv
from fastapi import APIRouter, Depends, HTTPException, status
from fastapi.responses import RedirectResponse
from fastapi import APIRouter, Depends, HTTPException, Request, status
from fastapi.responses import JSONResponse, RedirectResponse
from sqlalchemy.orm import Session
from app.core.config import settings
from app.core.rate_limit import limiter
from app.db.models import Researcher
from app.db.session import get_db
from app.schema.auth import OrcidLoginResponseSchema
from app.security.jwt import create_access_token
from app.security.oauth_state import (
attach_state_cookie,
clear_state_cookie,
generate_state,
validate_state,
)
from app.services.orcid_client import ORCIDClient
from app.utils.orcid_validator import is_valid_orcid
# Asegura que al ejecutar `uvicorn` local también se carga `backend/.env`.
_ENV_PATH = Path(__file__).resolve().parents[2] / ".env"
load_dotenv(dotenv_path=_ENV_PATH, override=False)
router = APIRouter(prefix="/auth", tags=["auth"])
logger = logging.getLogger("app.auth")
def _extract_display_name(record: dict) -> str | None:
person = (record or {}).get("person") or {}
name = person.get("name") or {}
given = ((name.get("given-names") or {}).get("value")) if isinstance(name.get("given-names"), dict) else None
family = ((name.get("family-name") or {}).get("value")) if isinstance(name.get("family-name"), dict) else None
full = " ".join([p for p in [given, family] if p])
given_obj = name.get("given-names")
family_obj = name.get("family-name")
given = given_obj.get("value") if isinstance(given_obj, dict) else None
family = family_obj.get("value") if isinstance(family_obj, dict) else None
full = " ".join(p for p in [given, family] if p)
return full or None
def _orcid_redirect_uri() -> str:
    """Return the OAuth redirect URI sent to ORCID.

    The value comes from ``settings.ORCID_REDIRECT_URI`` so that it is
    configured in one place, with no environment lookup or hard-coded
    localhost fallback here. It must match the ``redirect_uri`` registered
    for the ORCID integration.
    """
    return settings.ORCID_REDIRECT_URI
def _complete_oauth_login(*, code: str, db: Session) -> OrcidLoginResponseSchema:
"""
Completa el login OAuth:
1) intercambio del `code` en ORCID (server-side)
2) crea/actualiza el investigador
3) emite nuestro JWT
1) Intercambia el `code` con ORCID (server-side).
2) Crea/actualiza el investigador.
3) Emite el JWT propio.
"""
if not code:
raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Missing ORCID authorization code")
if not code or len(code) > 256:
raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid ORCID authorization code")
client = ORCIDClient()
redirect_uri = _orcid_redirect_uri()
try:
token_data = client.exchange_authorization_code(code=code, redirect_uri=redirect_uri)
token_data = client.exchange_authorization_code(code=code, redirect_uri=_orcid_redirect_uri())
except httpx.HTTPStatusError as exc:
logger.warning("ORCID token exchange failed: %s", exc.response.status_code)
raise HTTPException(
status_code=status.HTTP_502_BAD_GATEWAY,
detail=f"ORCID token error ({exc.response.status_code})",
)
except httpx.TimeoutException:
raise HTTPException(status_code=status.HTTP_504_GATEWAY_TIMEOUT, detail="ORCID timeout")
except Exception:
raise HTTPException(status_code=status.HTTP_502_BAD_GATEWAY, detail="ORCID unavailable")
detail="ORCID token exchange failed",
) from exc
except httpx.TimeoutException as exc:
raise HTTPException(status_code=status.HTTP_504_GATEWAY_TIMEOUT, detail="ORCID timeout") from exc
except Exception as exc:
logger.exception("Unexpected error during ORCID token exchange")
raise HTTPException(status_code=status.HTTP_502_BAD_GATEWAY, detail="ORCID unavailable") from exc
orcid_id = (token_data.get("orcid") or "").strip()
if not is_valid_orcid(orcid_id):
@@ -66,7 +70,6 @@ def _complete_oauth_login(*, code: str, db: Session) -> OrcidLoginResponseSchema
display_name = token_data.get("name")
if not display_name:
# Fallback si ORCID no devuelve `name` en el token response.
try:
record = client.fetch_record(orcid_id)
display_name = _extract_display_name(record)
@@ -89,21 +92,54 @@ def _complete_oauth_login(*, code: str, db: Session) -> OrcidLoginResponseSchema
return OrcidLoginResponseSchema(access_token=token)
@router.get("/orcid/authorize")
def authorize_orcid():
def complete_oauth_login_response(
    *, request: Request, code: str, state: str | None, db: Session
) -> JSONResponse:
    """Validate the OAuth ``state``, finish the login and build the response.

    The state is checked first (``validate_state``), so the authorization
    code is only exchanged once that check has returned. The login payload
    is wrapped in a ``JSONResponse`` built here, rather than returned as a
    schema object, so that the state cookie can be cleared on that same
    response.

    Args:
        request: Incoming callback request, passed to ``validate_state``.
        code: Authorization code received from ORCID.
        state: ``state`` query value echoed back by ORCID, if any.
        db: Database session used to complete the login.

    Returns:
        The JSON response carrying the serialized login payload.
    """
    validate_state(request, state)

    login_result = _complete_oauth_login(code=code, db=db)
    response = JSONResponse(content=login_result.model_dump())

    # The state has served its purpose once the login has completed.
    clear_state_cookie(response)
    return response
# ---------------------------------------------------------
# ENDPOINT 1: Iniciar flujo OAuth 3-legged hacia ORCID
# ---------------------------------------------------------
@router.get("/orcid/authorize")
@limiter.limit(settings.RATE_LIMIT_AUTH)
def authorize_orcid(request: Request):
    """Start the 3-legged OAuth flow by redirecting to ORCID.

    Builds the ORCID authorization URL for the ``/authenticate`` scope. When
    ``settings.ORCID_OAUTH_STATE_ENABLED`` is truthy, a fresh ``state`` value
    is generated, sent in the URL and attached to the redirect response as a
    cookie so the callback can validate it (anti-CSRF).

    Args:
        request: Incoming request. Unused in the body; presumably required
            by the rate-limit decorator -- confirm.

    Returns:
        A redirect to the ORCID authorization URL, carrying the state cookie
        when state handling is enabled.
    """
    client = ORCIDClient()
    state = generate_state() if settings.ORCID_OAUTH_STATE_ENABLED else None
    authorize_url = client.build_authorize_url(
        redirect_uri=_orcid_redirect_uri(),
        # Only the user's authenticated iD is needed.
        scope="/authenticate",
        state=state,
    )

    # Build the response first and return it last: returning the redirect
    # straight away would skip attaching the state cookie.
    response = RedirectResponse(authorize_url)
    if state:
        attach_state_cookie(response, state)
    return response
# ---------------------------------------------------------
# ENDPOINT 2: Callback OAuth 3-legged desde ORCID
# ---------------------------------------------------------
@router.get("/orcid/callback", response_model=OrcidLoginResponseSchema)
@limiter.limit(settings.RATE_LIMIT_AUTH)
def orcid_callback(
    request: Request,
    code: str,
    state: str | None = None,
    db: Session = Depends(get_db),
):
    """Handle the OAuth callback from ORCID.

    This is a single, rate-limited route. All work is delegated to
    ``complete_oauth_login_response`` so that the ``state`` check always runs
    before the authorization code is exchanged.

    Args:
        request: Incoming request, forwarded for state validation.
        code: Authorization code supplied by ORCID.
        state: ``state`` query value echoed back by ORCID, if any.
        db: Database session.

    Returns:
        The JSON response produced by ``complete_oauth_login_response``.
    """
    return complete_oauth_login_response(request=request, code=code, state=state, db=db)
+100 -82
View File
@@ -1,115 +1,146 @@
from fastapi import APIRouter, Depends, HTTPException
from fastapi.responses import Response
from sqlalchemy.orm import Session
from typing import Iterable, List
from uuid import UUID
from fastapi import APIRouter, Body, Depends, HTTPException, Path, Request
from fastapi.responses import Response
from sqlalchemy.orm import Session
from app.core.config import settings
from app.core.rate_limit import limiter
from app.db.models import Publication, PublicationDownload, Researcher
from app.db.session import get_db
from app.db.models import Publication, Researcher, PublicationDownload
from app.security.api_key import get_api_key_optional
from app.security.jwt import get_optional_current_researcher
from app.services.sword_generator import SWORDGenerator
from app.services.zip_generator import ZIPGenerator
from app.utils.orcid_validator import ORCID_PATTERN, is_valid_orcid
router = APIRouter(prefix="/export")
def validate_uuid_list(pub_ids: list[str]) -> list[UUID]:
    """Parse every publication ID as a UUID.

    Args:
        pub_ids: Publication identifiers as received from the client.

    Returns:
        The parsed ``UUID`` objects, in input order.

    Raises:
        HTTPException: 400 on the first entry that is not a valid UUID.
    """
    valid_ids: list[UUID] = []
    for pid in pub_ids:
        try:
            valid_ids.append(UUID(pid))
        # UUID() raises ValueError for malformed text and TypeError or
        # AttributeError for non-string input. Unrelated errors propagate.
        except (ValueError, TypeError, AttributeError) as exc:
            raise HTTPException(
                status_code=400,
                detail=f"Invalid publication ID (not UUID): {pid}",
            ) from exc
    return valid_ids
def _ensure_credentials(api_key: str | None, current: Researcher | None) -> None:
    """Require at least one form of authentication.

    Args:
        api_key: API key resolved for the request, if any.
        current: Researcher resolved for the request, if any.

    Raises:
        HTTPException: 401 when neither an API key nor a researcher is present.
    """
    if api_key or current:
        return
    raise HTTPException(status_code=401, detail="Authentication required")
def _record_downloads(db: Session, current: Researcher, pubs: Iterable[Publication]) -> None:
    """Insert download markers (researcher_id, publication_id) for ``current``.

    Existing markers are looked up with a single query and only the missing
    ones are added. The session is committed only when something was added.

    Args:
        db: Database session used for the lookup, insert and commit.
        current: Researcher the downloads are recorded for.
        pubs: Publications that were just exported.
    """
    # dict.fromkeys drops repeated IDs while keeping input order, so a
    # publication listed twice yields at most one new marker.
    pub_ids = list(dict.fromkeys(p.id for p in pubs))
    if not pub_ids:
        return

    rows = (
        db.query(PublicationDownload.publication_id)
        .filter(
            PublicationDownload.researcher_id == current.id,
            PublicationDownload.publication_id.in_(pub_ids),
        )
        .all()
    )
    already_recorded = {row[0] for row in rows}

    missing = [
        PublicationDownload(researcher_id=current.id, publication_id=pub_id)
        for pub_id in pub_ids
        if pub_id not in already_recorded
    ]
    if missing:
        db.add_all(missing)
        db.commit()
def _validate_pub_ids(pub_ids: List[UUID]) -> List[UUID]:
    """Reject batches larger than ``settings.MAX_PUB_IDS_BATCH``.

    Args:
        pub_ids: Publication IDs requested by the client.

    Returns:
        The same list, unchanged, when it is within the limit.

    Raises:
        HTTPException: 413 when the list exceeds the configured batch size.
    """
    batch_limit = settings.MAX_PUB_IDS_BATCH
    if len(pub_ids) <= batch_limit:
        return pub_ids
    raise HTTPException(status_code=413, detail="Too many publication IDs")
# ---------------------------------------------------------
# ENDPOINT 1: SWORD múltiples publicaciones
# ---------------------------------------------------------
@router.post("/sword/publications")
@limiter.limit(settings.RATE_LIMIT_EXPORT)
async def export_multiple_sword(
    request: Request,
    pub_ids: List[UUID] = Body(..., min_length=1, max_length=settings.MAX_PUB_IDS_BATCH),
    db: Session = Depends(get_db),
    api_key: str | None = Depends(get_api_key_optional),
    current: Researcher | None = Depends(get_optional_current_researcher),
):
    """Export several publications as one SWORD XML feed.

    Args:
        request: Incoming request. Unused in the body; presumably required
            by the rate-limit decorator -- confirm.
        pub_ids: Publication UUIDs from the request body (1 to
            ``settings.MAX_PUB_IDS_BATCH`` entries).
        db: Database session.
        api_key: API key resolved for the request, if any.
        current: Logged-in researcher, if any.

    Returns:
        An ``application/xml`` response with the generated feed.

    Raises:
        HTTPException: 401 without credentials, 413 for an oversized batch,
            404 when none of the IDs match a publication.
    """
    _ensure_credentials(api_key, current)
    _validate_pub_ids(pub_ids)

    pubs = db.query(Publication).filter(Publication.id.in_(pub_ids)).all()
    if not pubs:
        raise HTTPException(status_code=404, detail="No publications found")

    # NOTE(review): the feed is generated with the owner of the first
    # publication only; confirm that batches never mix researchers.
    researcher = db.query(Researcher).filter_by(id=pubs[0].researcher_id).first()
    xml_bytes = SWORDGenerator.generate_feed_xml(researcher, pubs)

    # Download markers are recorded only for a logged-in researcher.
    if current:
        _record_downloads(db, current, pubs)

    return Response(content=xml_bytes, media_type="application/xml")
# ---------------------------------------------------------
# ENDPOINT 2: SWORD por investigador
# ---------------------------------------------------------
@router.get("/sword/researcher/{orcid_id}")
@limiter.limit(settings.RATE_LIMIT_EXPORT)
async def export_researcher_sword(
    request: Request,
    orcid_id: str = Path(min_length=19, max_length=19, pattern=ORCID_PATTERN),
    db: Session = Depends(get_db),
    api_key: str | None = Depends(get_api_key_optional),
    current: Researcher | None = Depends(get_optional_current_researcher),
):
    """Export all publications of one researcher as a SWORD XML feed.

    Args:
        request: Incoming request. Unused in the body; presumably required
            by the rate-limit decorator -- confirm.
        orcid_id: ORCID iD path parameter (19 characters, must match
            ``ORCID_PATTERN``).
        db: Database session.
        api_key: API key resolved for the request, if any.
        current: Logged-in researcher, if any.

    Returns:
        An ``application/xml`` response with the generated feed.

    Raises:
        HTTPException: 401 without credentials, 400 when ``is_valid_orcid``
            rejects the iD, 404 when the researcher or their publications
            are not found.
    """
    _ensure_credentials(api_key, current)

    # Second check on top of the path pattern; its exact rules live in
    # is_valid_orcid and are not visible here.
    if not is_valid_orcid(orcid_id):
        raise HTTPException(status_code=400, detail="Invalid ORCID iD")

    researcher = db.query(Researcher).filter_by(orcid_id=orcid_id).first()
    if not researcher:
        raise HTTPException(status_code=404, detail="Researcher not found")

    pubs = db.query(Publication).filter_by(researcher_id=researcher.id).all()
    if not pubs:
        raise HTTPException(status_code=404, detail="No publications found for this researcher")

    xml_bytes = SWORDGenerator.generate_feed_xml(researcher, pubs)

    # Download markers are recorded only for a logged-in researcher.
    if current:
        _record_downloads(db, current, pubs)

    return Response(content=xml_bytes, media_type="application/xml")
# ---------------------------------------------------------
# ENDPOINT 3: ZIP múltiples publicaciones
# ---------------------------------------------------------
@router.post("/zip/publications")
@limiter.limit(settings.RATE_LIMIT_EXPORT)
async def export_multiple_zip(
pub_ids: list[str],
request: Request,
pub_ids: List[UUID] = Body(..., min_length=1, max_length=settings.MAX_PUB_IDS_BATCH),
db: Session = Depends(get_db),
api_key: str | None = Depends(get_api_key_optional),
current: Researcher | None = Depends(get_optional_current_researcher),
):
if not api_key and not current:
raise HTTPException(status_code=401, detail="Missing credentials")
validate_uuid_list(pub_ids)
_ensure_credentials(api_key, current)
_validate_pub_ids(pub_ids)
pubs = db.query(Publication).filter(Publication.id.in_(pub_ids)).all()
if not pubs:
raise HTTPException(status_code=404, detail="No publications found")
@@ -117,51 +148,38 @@ async def export_multiple_zip(
zip_bytes = ZIPGenerator.generate_zip(researcher, pubs)
if current:
for p in pubs:
exists = (
db.query(PublicationDownload)
.filter(
PublicationDownload.researcher_id == current.id,
PublicationDownload.publication_id == p.id,
)
.first()
)
if not exists:
db.add(PublicationDownload(researcher_id=current.id, publication_id=p.id))
db.commit()
_record_downloads(db, current, pubs)
return Response(content=zip_bytes, media_type="application/zip")
# ---------------------------------------------------------
# ENDPOINT 4: ZIP por investigador
# ---------------------------------------------------------
@router.get("/zip/researcher/{orcid_id}")
@limiter.limit(settings.RATE_LIMIT_EXPORT)
async def export_researcher_zip(
    request: Request,
    orcid_id: str = Path(min_length=19, max_length=19, pattern=ORCID_PATTERN),
    db: Session = Depends(get_db),
    api_key: str | None = Depends(get_api_key_optional),
    current: Researcher | None = Depends(get_optional_current_researcher),
):
    """Export all publications of one researcher as a ZIP archive.

    Args:
        request: Incoming request. Unused in the body; presumably required
            by the rate-limit decorator -- confirm.
        orcid_id: ORCID iD path parameter (19 characters, must match
            ``ORCID_PATTERN``).
        db: Database session.
        api_key: API key resolved for the request, if any.
        current: Logged-in researcher, if any.

    Returns:
        An ``application/zip`` response with the generated archive.

    Raises:
        HTTPException: 401 without credentials, 400 when ``is_valid_orcid``
            rejects the iD, 404 when the researcher or their publications
            are not found.
    """
    _ensure_credentials(api_key, current)

    # Second check on top of the path pattern; its exact rules live in
    # is_valid_orcid and are not visible here.
    if not is_valid_orcid(orcid_id):
        raise HTTPException(status_code=400, detail="Invalid ORCID iD")

    researcher = db.query(Researcher).filter_by(orcid_id=orcid_id).first()
    if not researcher:
        raise HTTPException(status_code=404, detail="Researcher not found")

    pubs = db.query(Publication).filter_by(researcher_id=researcher.id).all()
    if not pubs:
        raise HTTPException(status_code=404, detail="No publications found for this researcher")

    zip_bytes = ZIPGenerator.generate_zip(researcher, pubs)

    # Download markers are recorded only for a logged-in researcher.
    if current:
        _record_downloads(db, current, pubs)

    return Response(content=zip_bytes, media_type="application/zip")
+62 -37
View File
@@ -2,11 +2,14 @@ from datetime import datetime
from typing import List
import httpx
from fastapi import APIRouter, Depends, HTTPException
from fastapi import APIRouter, Depends, HTTPException, Path, Request
from sqlalchemy.orm import Session
from app.db.models import Publication, Researcher
from app.core.config import settings
from app.core.rate_limit import limiter
from app.db.models import Publication, PublicationDownload, Researcher
from app.db.session import get_db
from app.schema.publication import PublicationSchema
from app.schema.researcher import (
ResearcherBatchSearchRequestSchema,
ResearcherBatchSearchResponseSchema,
@@ -14,18 +17,15 @@ from app.schema.researcher import (
ResearcherStatsSchema,
ResearcherWithPublicationsSchema,
)
from app.security.jwt import get_current_researcher, get_optional_current_researcher
from app.services.normalizer import PublicationNormalizer
from app.services.orcid_client import get_display_name, get_works_summary, get_work_detail
from app.schema.publication import PublicationSchema
from app.db.models import PublicationDownload
from app.security.jwt import get_optional_current_researcher
from app.services.orcid_client import get_display_name, get_work_detail, get_works_summary
from app.utils.orcid_validator import ORCID_PATTERN, is_valid_orcid
router = APIRouter(prefix="/researchers", tags=["researchers"])
# ---------------------------------------------------------
# Función auxiliar: detectar si una publicación ha cambiado
# ---------------------------------------------------------
def publication_changed(existing: Publication, data: dict) -> bool:
fields = [
"title", "subtitle", "type", "journal",
@@ -33,18 +33,13 @@ def publication_changed(existing: Publication, data: dict) -> bool:
"doi", "url", "short_description",
"citation_type", "citation_value",
"language_code", "country",
"external_ids", "contributors"
"external_ids", "contributors",
]
for f in fields:
if getattr(existing, f) != data[f]:
return True
return False
return any(getattr(existing, f) != data[f] for f in fields)
def build_researcher_stats(publications: list) -> ResearcherStatsSchema:
publication_types: dict[str, int] = {}
for publication in publications:
pub_type = getattr(publication, "type", None) or "unknown"
publication_types[pub_type] = publication_types.get(pub_type, 0) + 1
@@ -98,7 +93,7 @@ def _upsert_researcher_publications(
"doi", "url", "short_description",
"citation_type", "citation_value",
"language_code", "country",
"external_ids", "contributors"
"external_ids", "contributors",
]:
setattr(existing, field, data[field])
existing.last_modified = datetime.utcnow()
@@ -142,12 +137,17 @@ def _decorate_downloaded_by_me(
out: List[PublicationSchema] = []
for p in publications:
out.append(
PublicationSchema.model_validate(p).model_copy(update={"downloaded_by_me": p.id in downloaded_ids})
PublicationSchema.model_validate(p).model_copy(
update={"downloaded_by_me": p.id in downloaded_ids}
)
)
return out
def build_search_response(orcid_id: str, db: Session, current: Researcher | None) -> ResearcherWithPublicationsSchema:
if not is_valid_orcid(orcid_id):
raise HTTPException(status_code=400, detail="Invalid ORCID iD")
researcher = db.query(Researcher).filter(Researcher.orcid_id == orcid_id).first()
if not researcher:
researcher = Researcher(
@@ -159,10 +159,6 @@ def build_search_response(orcid_id: str, db: Session, current: Researcher | None
db.add(researcher)
db.flush()
# Si todavía no conocemos el nombre del investigador (por ejemplo, recién
# creado al sincronizarse desde el buscador), lo resolvemos contra el
# endpoint `/record` público de ORCID. No tocamos un nombre ya existente
# para no pisar valores establecidos por el flujo de autenticación.
if not researcher.name:
display_name = get_display_name(orcid_id)
if display_name:
@@ -185,10 +181,27 @@ def build_search_response(orcid_id: str, db: Session, current: Researcher | None
# ---------------------------------------------------------
# ENDPOINT 1: SEARCH + SYNC (sin contadores)
# ENDPOINT 1: SEARCH + SYNC
# ---------------------------------------------------------
@router.post("/search", response_model=ResearcherBatchSearchResponseSchema, response_model_exclude_none=True)
def _search_rate_limit(request: Request | None = None) -> str:
    """Choose the rate-limit string for the search endpoint.

    Returns ``settings.RATE_LIMIT_SEARCH_AUTH`` when the request carries a
    truthy ``state.researcher`` and ``settings.RATE_LIMIT_SEARCH_ANON``
    otherwise, including when no request is supplied.

    NOTE(review): SlowAPI appears to call dynamic limit providers either with
    no arguments or with a single ``key`` argument, never with the request.
    ``request`` is therefore optional so that such a call falls back to the
    anonymous limit instead of raising ``TypeError``. Confirm against the
    limiter in use. If that is the case, the authenticated limit will only
    take effect once the provider is reworked around a key function.

    NOTE(review): nothing visible here sets ``request.state.researcher``.
    Presumably a middleware does, since the limiter decorator runs before
    dependencies are resolved -- verify.

    Args:
        request: Incoming request, if the caller provides one.

    Returns:
        The rate-limit string to apply.
    """
    state = getattr(request, "state", None)
    researcher = getattr(state, "researcher", None)
    return settings.RATE_LIMIT_SEARCH_AUTH if researcher else settings.RATE_LIMIT_SEARCH_ANON
@router.post(
"/search",
response_model=ResearcherBatchSearchResponseSchema,
response_model_exclude_none=True,
)
@limiter.limit(_search_rate_limit)
def search_and_sync_researchers(
request: Request,
payload: ResearcherBatchSearchRequestSchema,
db: Session = Depends(get_db),
current: Researcher | None = Depends(get_optional_current_researcher),
@@ -196,26 +209,33 @@ def search_and_sync_researchers(
results: List[ResearcherWithPublicationsSchema] = []
errors: List[ResearcherSearchErrorSchema] = []
# Evita llamadas duplicadas a ORCID conservando el orden de entrada.
unique_orcid_ids = list(dict.fromkeys(payload.orcid_ids))
for orcid_id in unique_orcid_ids:
try:
results.append(build_search_response(orcid_id, db, current))
except HTTPException as exc:
db.rollback()
errors.append(
ResearcherSearchErrorSchema(
orcid_id=orcid_id,
detail=str(exc.detail),
)
)
except httpx.HTTPStatusError as exc:
db.rollback()
errors.append(
ResearcherSearchErrorSchema(
orcid_id=orcid_id,
detail=f"ORCID devolvió {exc.response.status_code} para {orcid_id}.",
detail=f"ORCID returned {exc.response.status_code}",
)
)
except Exception as exc:
except Exception:
db.rollback()
errors.append(
ResearcherSearchErrorSchema(
orcid_id=orcid_id,
detail=str(exc),
detail="Unexpected error while processing ORCID iD",
)
)
@@ -228,14 +248,24 @@ def search_and_sync_researchers(
# ---------------------------------------------------------
# ENDPOINT 2: SYNC COMPLETO (con contadores + status)
# ENDPOINT 2: SYNC COMPLETO (requiere autenticación)
# ---------------------------------------------------------
@router.post("/{orcid_id}/sync", response_model=ResearcherWithPublicationsSchema, response_model_exclude_none=True)
@router.post(
"/{orcid_id}/sync",
response_model=ResearcherWithPublicationsSchema,
response_model_exclude_none=True,
)
@limiter.limit(settings.RATE_LIMIT_SYNC)
def sync_researcher(
orcid_id: str,
request: Request,
orcid_id: str = Path(min_length=19, max_length=19, pattern=ORCID_PATTERN),
db: Session = Depends(get_db),
current: Researcher | None = Depends(get_optional_current_researcher),
current: Researcher = Depends(get_current_researcher),
):
if not is_valid_orcid(orcid_id):
raise HTTPException(status_code=400, detail="Invalid ORCID iD")
researcher = db.query(Researcher).filter_by(orcid_id=orcid_id).first()
if not researcher:
raise HTTPException(status_code=404, detail="Researcher not found")
@@ -244,7 +274,6 @@ def sync_researcher(
groups = works.get("group", [])
publications_output = []
new_count = 0
updated_count = 0
unchanged_count = 0
@@ -277,21 +306,17 @@ def sync_researcher(
if existing:
if publication_changed(existing, data):
# updated
for field in data:
setattr(existing, field, data[field])
existing.last_modified = datetime.utcnow()
existing.status = "updated"
updated_count += 1
else:
# unchanged
existing.status = "unchanged"
unchanged_count += 1
pub = existing
else:
# new
pub = Publication(
researcher_id=researcher.id,
**data,