From 238742bda4d9817b7b80a47bcf84b40d5e1a3410 Mon Sep 17 00:00:00 2001 From: Mireya Cueto Garrido Date: Wed, 20 May 2026 12:25:20 +0200 Subject: [PATCH] feat(backend): rendimiento ORCID y autosync configurable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reutiliza cliente HTTP ORCID, reduce consultas DB al sincronizar works y añade SYNC_* en settings y scheduler (cron mensual o intervalo). Actualiza backend/.env, frontend/.env y .env.example para despliegue. --- backend/.env.example | 22 +++++++++- backend/app/api/researchers.py | 41 ++++++++--------- backend/app/core/config.py | 6 +++ backend/app/scheduler/sync_scheduler.py | 37 +++++++++++++++- backend/app/services/orcid_client.py | 58 ++++++++++++++----------- 5 files changed, 114 insertions(+), 50 deletions(-) diff --git a/backend/.env.example b/backend/.env.example index efe2676..7ec56d3 100644 --- a/backend/.env.example +++ b/backend/.env.example @@ -28,4 +28,24 @@ JWT_EXPIRES_MINUTES=720 JWT_ISSUER=orcid-sword-backend JWT_AUDIENCE=orcid-sword-frontend -DOCS_ENABLED=false \ No newline at end of file +DOCS_ENABLED=false + +# Rate limits (ajústalos por entorno) +RATE_LIMIT_DEFAULT=60/minute +RATE_LIMIT_AUTH=10/minute +RATE_LIMIT_SEARCH_ANON=5/minute +RATE_LIMIT_SEARCH_AUTH=30/minute +RATE_LIMIT_EXPORT=20/minute +RATE_LIMIT_SYNC=5/minute + +MAX_ORCID_BATCH=25 +MAX_PUB_IDS_BATCH=500 +MAX_REQUEST_BODY_BYTES=1048576 + +# Autosincronización (APScheduler): mensual por defecto; interval_minutes para pruebas +SYNC_SCHEDULER_ENABLED=true +SYNC_SCHEDULE_MODE=monthly_cron +SYNC_CRON_DAY=1 +SYNC_CRON_HOUR=3 +# SYNC_SCHEDULE_MODE=interval_minutes +# SYNC_INTERVAL_MINUTES=2 \ No newline at end of file diff --git a/backend/app/api/researchers.py b/backend/app/api/researchers.py index 82859df..1f07198 100644 --- a/backend/app/api/researchers.py +++ b/backend/app/api/researchers.py @@ -19,7 +19,7 @@ from app.schema.researcher import ( ) from app.security.jwt import get_optional_current_researcher from app.services.normalizer import PublicationNormalizer -from app.services.orcid_client import get_display_name, get_work_detail, get_works_summary +from app.services.orcid_client import get_display_name, get_orcid_client from app.utils.orcid_validator import ORCID_PATTERN, is_valid_orcid @@ -55,10 +55,15 @@ def _upsert_researcher_publications( orcid_id: str, db: Session, ) -> List[Publication]: - works = get_works_summary(orcid_id) + orcid_client = get_orcid_client() + works = orcid_client.fetch_works(orcid_id) groups = works.get("group", []) publications: List[Publication] = [] + existing_by_put_code = { + publication.put_code: publication + for publication in db.query(Publication).filter(Publication.researcher_id == researcher.id).all() + } for g in groups: summaries = g.get("work-summary") or [] @@ -71,20 +76,13 @@ def _upsert_researcher_publications( continue try: - detail = get_work_detail(orcid_id, put_code) + detail = orcid_client.fetch_work_detail(orcid_id, put_code) except Exception: detail = None data = PublicationNormalizer.normalize(summary, detail) - existing = ( - db.query(Publication) - .filter( - Publication.researcher_id == researcher.id, - Publication.put_code == data["put_code"], - ) - .first() - ) + existing = existing_by_put_code.get(data["put_code"]) if existing: for field in [ @@ -108,6 +106,7 @@ def _upsert_researcher_publications( pub.status = None db.add(pub) publications.append(pub) + existing_by_put_code[data["put_code"]] = pub researcher.last_sync_at = datetime.utcnow() db.commit() @@ -261,13 +260,18 @@ def sync_researcher( if not researcher: raise HTTPException(status_code=404, detail="Researcher not found") - works = get_works_summary(orcid_id) + orcid_client = get_orcid_client() + works = orcid_client.fetch_works(orcid_id) groups = works.get("group", []) publications_output = [] new_count = 0 updated_count = 0 unchanged_count = 0 + existing_by_put_code = { + publication.put_code: publication + for publication in db.query(Publication).filter(Publication.researcher_id == researcher.id).all() + } for g in groups: summaries = g.get("work-summary") or [] @@ -280,20 +284,13 @@ def sync_researcher( continue try: - detail = get_work_detail(orcid_id, put_code) + detail = orcid_client.fetch_work_detail(orcid_id, put_code) except Exception: detail = None data = PublicationNormalizer.normalize(summary, detail) - existing = ( - db.query(Publication) - .filter( - Publication.researcher_id == researcher.id, - Publication.put_code == data["put_code"], - ) - .first() - ) + existing = existing_by_put_code.get(data["put_code"]) if existing: if publication_changed(existing, data): @@ -316,8 +313,8 @@ def sync_researcher( pub.status = "new" db.add(pub) new_count += 1 + existing_by_put_code[data["put_code"]] = pub - db.flush() publications_output.append(pub) researcher.last_sync_at = datetime.utcnow() diff --git a/backend/app/core/config.py b/backend/app/core/config.py index 5c706fb..40df40e 100644 --- a/backend/app/core/config.py +++ b/backend/app/core/config.py @@ -94,6 +94,12 @@ class Settings(BaseSettings): MAX_PUB_IDS_BATCH: int = 500 MAX_REQUEST_BODY_BYTES: int = 1_048_576 # 1 MiB + SYNC_SCHEDULER_ENABLED: bool = True + SYNC_SCHEDULE_MODE: Literal["monthly_cron", "interval_minutes"] = "monthly_cron" + SYNC_CRON_DAY: int = 1 + SYNC_CRON_HOUR: int = 3 + SYNC_INTERVAL_MINUTES: int = 60 + DOCS_ENABLED: bool = True SECURITY_HSTS_SECONDS: int = 31_536_000 diff --git a/backend/app/scheduler/sync_scheduler.py b/backend/app/scheduler/sync_scheduler.py index 69ce594..1b438ed 100644 --- a/backend/app/scheduler/sync_scheduler.py +++ b/backend/app/scheduler/sync_scheduler.py @@ -4,6 +4,9 @@ from app.db.session import SessionLocal from app.db.repositories.researcher_repository import ResearcherRepository from dotenv import load_dotenv import os +import logging + +from app.core.config import settings # Cargar variables del .env @@ -15,6 +18,7 @@ load_dotenv() API_KEY = os.getenv("API_KEY_VALUE") BASE_URL = os.getenv("BASE_URL") +logger = logging.getLogger("app.scheduler.sync") # --------------------------------------------------------- # Función auxiliar: ejecutar sincronización mensual @@ -48,6 +52,37 @@ def run_monthly_sync(): # --------------------------------------------------------- def start_scheduler(): + if not settings.SYNC_SCHEDULER_ENABLED: + logger.info("Autosync scheduler disabled by SYNC_SCHEDULER_ENABLED=false") + return + scheduler = BackgroundScheduler() - scheduler.add_job(run_monthly_sync, "cron", day=1, hour=3) # día 1 a las 03:00 + + if settings.SYNC_SCHEDULE_MODE == "interval_minutes": + scheduler.add_job( + run_monthly_sync, + "interval", + minutes=settings.SYNC_INTERVAL_MINUTES, + id="researchers-autosync", + replace_existing=True, + ) + logger.info( + "Autosync scheduler started in interval mode: every %s minute(s)", + settings.SYNC_INTERVAL_MINUTES, + ) + else: + scheduler.add_job( + run_monthly_sync, + "cron", + day=settings.SYNC_CRON_DAY, + hour=settings.SYNC_CRON_HOUR, + id="researchers-autosync", + replace_existing=True, + ) + logger.info( + "Autosync scheduler started in monthly mode: day=%s hour=%s", + settings.SYNC_CRON_DAY, + settings.SYNC_CRON_HOUR, + ) + scheduler.start() diff --git a/backend/app/services/orcid_client.py b/backend/app/services/orcid_client.py index 872ebe1..daeb6b5 100644 --- a/backend/app/services/orcid_client.py +++ b/backend/app/services/orcid_client.py @@ -37,6 +37,7 @@ class ORCIDClient: self.client_id = settings.ORCID_CLIENT_ID self.client_secret = settings.ORCID_CLIENT_SECRET self._token_cache: Optional[str] = None + self._http = httpx.Client(timeout=20.0) self.token_url = endpoints["token_url"] self.authorization_url = endpoints["authorization_url"] self.base_url = endpoints["api_base_url"] @@ -55,12 +56,11 @@ class ORCIDClient: "scope": "/read-public", } - with httpx.Client(timeout=20.0) as client: - response = client.post(self.token_url, data=data) - response.raise_for_status() - token = response.json()["access_token"] - self._token_cache = token - return token + response = self._http.post(self.token_url, data=data) + response.raise_for_status() + token = response.json()["access_token"] + self._token_cache = token + return token # --------------------------------------------------------- # Headers comunes @@ -77,31 +77,28 @@ class ORCIDClient: # --------------------------------------------------------- def fetch_record(self, orcid_id: str) -> dict: url = f"{self.base_url}/{orcid_id}/record" - with httpx.Client(timeout=20.0) as client: - response = client.get(url, headers=self._headers()) - response.raise_for_status() - return response.json() + response = self._http.get(url, headers=self._headers()) + response.raise_for_status() + return response.json() # --------------------------------------------------------- # 3. Consultar /works (summary) # --------------------------------------------------------- def fetch_works(self, orcid_id: str) -> dict: url = f"{self.base_url}/{orcid_id}/works" - with httpx.Client(timeout=20.0) as client: - response = client.get(url, headers=self._headers()) - response.raise_for_status() - return response.json() + response = self._http.get(url, headers=self._headers()) + response.raise_for_status() + return response.json() # --------------------------------------------------------- # 4. Consultar /work/{put_code} (detalle) # --------------------------------------------------------- def fetch_work_detail(self, orcid_id: str, put_code: int) -> dict | None: url = f"{self.base_url}/{orcid_id}/work/{put_code}" - with httpx.Client(timeout=20.0) as client: - response = client.get(url, headers=self._headers()) - if response.status_code != 200: - return None - return response.json() + response = self._http.get(url, headers=self._headers()) + if response.status_code != 200: + return None + return response.json() # --------------------------------------------------------- # OAuth 3-legged (authorization code) @@ -147,27 +144,36 @@ class ORCIDClient: "code": code, "redirect_uri": redirect_uri, } - with httpx.Client(timeout=20.0) as client: - response = client.post(self.token_url, data=data, headers={"Accept": "application/json"}) - response.raise_for_status() - return response.json() + response = self._http.post(self.token_url, data=data, headers={"Accept": "application/json"}) + response.raise_for_status() + return response.json() # ------------------------------------------------------------------- # Funciones de módulo usadas en researchers.py # ------------------------------------------------------------------- +_shared_client: ORCIDClient | None = None + + +def get_orcid_client() -> ORCIDClient: + global _shared_client + if _shared_client is None: + _shared_client = ORCIDClient() + return _shared_client + + def get_works_summary(orcid_id: str) -> dict: - client = ORCIDClient() + client = get_orcid_client() return client.fetch_works(orcid_id) def get_work_detail(orcid_id: str, put_code: int) -> dict | None: - client = ORCIDClient() + client = get_orcid_client() return client.fetch_work_detail(orcid_id, put_code) def get_record(orcid_id: str) -> dict: - client = ORCIDClient() + client = get_orcid_client() return client.fetch_record(orcid_id)