feat(backend): rendimiento ORCID y autosync configurable

Reutiliza cliente HTTP ORCID, reduce consultas DB al sincronizar works y añade SYNC_* en settings y scheduler (cron mensual o intervalo). Actualiza backend/.env, frontend/.env y .env.example para despliegue.
This commit is contained in:
Mireya Cueto Garrido
2026-05-20 12:25:20 +02:00
parent 5e0b5c4289
commit 238742bda4
5 changed files with 114 additions and 50 deletions
+20
View File
@@ -29,3 +29,23 @@ JWT_ISSUER=orcid-sword-backend
JWT_AUDIENCE=orcid-sword-frontend
DOCS_ENABLED=false
# Rate limits (ajústalos por entorno)
RATE_LIMIT_DEFAULT=60/minute
RATE_LIMIT_AUTH=10/minute
RATE_LIMIT_SEARCH_ANON=5/minute
RATE_LIMIT_SEARCH_AUTH=30/minute
RATE_LIMIT_EXPORT=20/minute
RATE_LIMIT_SYNC=5/minute
MAX_ORCID_BATCH=25
MAX_PUB_IDS_BATCH=500
MAX_REQUEST_BODY_BYTES=1048576
# Autosincronización (APScheduler): mensual por defecto; interval_minutes para pruebas
SYNC_SCHEDULER_ENABLED=true
SYNC_SCHEDULE_MODE=monthly_cron
SYNC_CRON_DAY=1
SYNC_CRON_HOUR=3
# SYNC_SCHEDULE_MODE=interval_minutes
# SYNC_INTERVAL_MINUTES=2
+19 -22
View File
@@ -19,7 +19,7 @@ from app.schema.researcher import (
)
from app.security.jwt import get_optional_current_researcher
from app.services.normalizer import PublicationNormalizer
from app.services.orcid_client import get_display_name, get_work_detail, get_works_summary
from app.services.orcid_client import get_display_name, get_orcid_client
from app.utils.orcid_validator import ORCID_PATTERN, is_valid_orcid
@@ -55,10 +55,15 @@ def _upsert_researcher_publications(
orcid_id: str,
db: Session,
) -> List[Publication]:
works = get_works_summary(orcid_id)
orcid_client = get_orcid_client()
works = orcid_client.fetch_works(orcid_id)
groups = works.get("group", [])
publications: List[Publication] = []
existing_by_put_code = {
publication.put_code: publication
for publication in db.query(Publication).filter(Publication.researcher_id == researcher.id).all()
}
for g in groups:
summaries = g.get("work-summary") or []
@@ -71,20 +76,13 @@ def _upsert_researcher_publications(
continue
try:
detail = get_work_detail(orcid_id, put_code)
detail = orcid_client.fetch_work_detail(orcid_id, put_code)
except Exception:
detail = None
data = PublicationNormalizer.normalize(summary, detail)
existing = (
db.query(Publication)
.filter(
Publication.researcher_id == researcher.id,
Publication.put_code == data["put_code"],
)
.first()
)
existing = existing_by_put_code.get(data["put_code"])
if existing:
for field in [
@@ -108,6 +106,7 @@ def _upsert_researcher_publications(
pub.status = None
db.add(pub)
publications.append(pub)
existing_by_put_code[data["put_code"]] = pub
researcher.last_sync_at = datetime.utcnow()
db.commit()
@@ -261,13 +260,18 @@ def sync_researcher(
if not researcher:
raise HTTPException(status_code=404, detail="Researcher not found")
works = get_works_summary(orcid_id)
orcid_client = get_orcid_client()
works = orcid_client.fetch_works(orcid_id)
groups = works.get("group", [])
publications_output = []
new_count = 0
updated_count = 0
unchanged_count = 0
existing_by_put_code = {
publication.put_code: publication
for publication in db.query(Publication).filter(Publication.researcher_id == researcher.id).all()
}
for g in groups:
summaries = g.get("work-summary") or []
@@ -280,20 +284,13 @@ def sync_researcher(
continue
try:
detail = get_work_detail(orcid_id, put_code)
detail = orcid_client.fetch_work_detail(orcid_id, put_code)
except Exception:
detail = None
data = PublicationNormalizer.normalize(summary, detail)
existing = (
db.query(Publication)
.filter(
Publication.researcher_id == researcher.id,
Publication.put_code == data["put_code"],
)
.first()
)
existing = existing_by_put_code.get(data["put_code"])
if existing:
if publication_changed(existing, data):
@@ -316,8 +313,8 @@ def sync_researcher(
pub.status = "new"
db.add(pub)
new_count += 1
existing_by_put_code[data["put_code"]] = pub
db.flush()
publications_output.append(pub)
researcher.last_sync_at = datetime.utcnow()
+6
View File
@@ -94,6 +94,12 @@ class Settings(BaseSettings):
MAX_PUB_IDS_BATCH: int = 500
MAX_REQUEST_BODY_BYTES: int = 1_048_576 # 1 MiB
SYNC_SCHEDULER_ENABLED: bool = True
SYNC_SCHEDULE_MODE: Literal["monthly_cron", "interval_minutes"] = "monthly_cron"
SYNC_CRON_DAY: int = 1
SYNC_CRON_HOUR: int = 3
SYNC_INTERVAL_MINUTES: int = 60
DOCS_ENABLED: bool = True
SECURITY_HSTS_SECONDS: int = 31_536_000
+36 -1
View File
@@ -4,6 +4,9 @@ from app.db.session import SessionLocal
from app.db.repositories.researcher_repository import ResearcherRepository
from dotenv import load_dotenv
import os
import logging
from app.core.config import settings
# Cargar variables del .env
@@ -15,6 +18,7 @@ load_dotenv()
API_KEY = os.getenv("API_KEY_VALUE")
BASE_URL = os.getenv("BASE_URL")
logger = logging.getLogger("app.scheduler.sync")
# ---------------------------------------------------------
# Función auxiliar: ejecutar sincronización mensual
@@ -48,6 +52,37 @@ def run_monthly_sync():
# ---------------------------------------------------------
def start_scheduler():
    """Start the background autosync scheduler according to ``settings``.

    When ``settings.SYNC_SCHEDULER_ENABLED`` is false the function only logs
    that fact and returns without creating a scheduler.

    Otherwise ``run_monthly_sync`` is registered exactly once, under the job id
    ``researchers-autosync``:

    * ``SYNC_SCHEDULE_MODE == "interval_minutes"``: interval trigger firing
      every ``SYNC_INTERVAL_MINUTES`` minutes (clamped to at least 1).
    * any other mode: cron trigger on ``SYNC_CRON_DAY`` at ``SYNC_CRON_HOUR``.
    """
    if not settings.SYNC_SCHEDULER_ENABLED:
        logger.info("Autosync scheduler disabled by SYNC_SCHEDULER_ENABLED=false")
        return

    scheduler = BackgroundScheduler()

    # One fixed job id for both modes: the sync is never registered twice and
    # the previously hard-coded "day=1, hour=3" job is no longer added on top
    # of the configured one.
    job_options = {"id": "researchers-autosync", "replace_existing": True}

    if settings.SYNC_SCHEDULE_MODE == "interval_minutes":
        # A zero or negative setting is not a meaningful schedule; fall back to
        # the smallest sensible interval instead of handing it to the trigger.
        interval_minutes = max(1, settings.SYNC_INTERVAL_MINUTES)
        scheduler.add_job(
            run_monthly_sync,
            "interval",
            minutes=interval_minutes,
            **job_options,
        )
        logger.info(
            "Autosync scheduler started in interval mode: every %s minute(s)",
            interval_minutes,
        )
    else:
        scheduler.add_job(
            run_monthly_sync,
            "cron",
            day=settings.SYNC_CRON_DAY,
            hour=settings.SYNC_CRON_HOUR,
            **job_options,
        )
        logger.info(
            "Autosync scheduler started in monthly mode: day=%s hour=%s",
            settings.SYNC_CRON_DAY,
            settings.SYNC_CRON_HOUR,
        )

    scheduler.start()
+32 -26
View File
@@ -37,6 +37,7 @@ class ORCIDClient:
self.client_id = settings.ORCID_CLIENT_ID
self.client_secret = settings.ORCID_CLIENT_SECRET
self._token_cache: Optional[str] = None
self._http = httpx.Client(timeout=20.0)
self.token_url = endpoints["token_url"]
self.authorization_url = endpoints["authorization_url"]
self.base_url = endpoints["api_base_url"]
@@ -55,12 +56,11 @@ class ORCIDClient:
"scope": "/read-public",
}
with httpx.Client(timeout=20.0) as client:
response = client.post(self.token_url, data=data)
response.raise_for_status()
token = response.json()["access_token"]
self._token_cache = token
return token
response = self._http.post(self.token_url, data=data)
response.raise_for_status()
token = response.json()["access_token"]
self._token_cache = token
return token
# ---------------------------------------------------------
# Headers comunes
@@ -77,31 +77,28 @@ class ORCIDClient:
# ---------------------------------------------------------
def fetch_record(self, orcid_id: str) -> dict:
    """Fetch the ORCID record for ``orcid_id`` and return the decoded JSON.

    Sends a single GET to ``{base_url}/{orcid_id}/record`` through the shared
    ``self._http`` client, using the headers built by ``self._headers()``.
    No per-call ``httpx.Client`` is created.

    Raises:
        Whatever ``response.raise_for_status()`` raises for an error status.
    """
    url = f"{self.base_url}/{orcid_id}/record"
    response = self._http.get(url, headers=self._headers())
    response.raise_for_status()
    return response.json()
# ---------------------------------------------------------
# 3. Consultar /works (summary)
# ---------------------------------------------------------
def fetch_works(self, orcid_id: str) -> dict:
    """Fetch the works summary for ``orcid_id`` and return the decoded JSON.

    Sends a single GET to ``{base_url}/{orcid_id}/works`` through the shared
    ``self._http`` client, using the headers built by ``self._headers()``.
    No per-call ``httpx.Client`` is created.

    Raises:
        Whatever ``response.raise_for_status()`` raises for an error status.
    """
    url = f"{self.base_url}/{orcid_id}/works"
    response = self._http.get(url, headers=self._headers())
    response.raise_for_status()
    return response.json()
# ---------------------------------------------------------
# 4. Consultar /work/{put_code} (detalle)
# ---------------------------------------------------------
def fetch_work_detail(self, orcid_id: str, put_code: int) -> dict | None:
    """Fetch the detail of one work, or ``None`` when it is unavailable.

    Sends a single GET to ``{base_url}/{orcid_id}/work/{put_code}`` through the
    shared ``self._http`` client, using the headers built by
    ``self._headers()``. No per-call ``httpx.Client`` is created.

    Returns:
        The decoded JSON body on HTTP 200; ``None`` for any other status.
        Non-200 responses are deliberately not raised, so a missing detail
        does not abort the caller.
    """
    url = f"{self.base_url}/{orcid_id}/work/{put_code}"
    response = self._http.get(url, headers=self._headers())
    if response.status_code != 200:
        return None
    return response.json()
# ---------------------------------------------------------
# OAuth 3-legged (authorization code)
@@ -147,27 +144,36 @@ class ORCIDClient:
"code": code,
"redirect_uri": redirect_uri,
}
with httpx.Client(timeout=20.0) as client:
response = client.post(self.token_url, data=data, headers={"Accept": "application/json"})
response.raise_for_status()
return response.json()
response = self._http.post(self.token_url, data=data, headers={"Accept": "application/json"})
response.raise_for_status()
return response.json()
# -------------------------------------------------------------------
# Funciones de módulo usadas en researchers.py
# -------------------------------------------------------------------
_shared_client: ORCIDClient | None = None
def get_orcid_client() -> ORCIDClient:
    """Return the module-wide ``ORCIDClient``, building it on the first call.

    The instance is stored in the module global ``_shared_client`` so later
    calls get the same object back.
    """
    global _shared_client
    client = _shared_client
    if client is None:
        client = ORCIDClient()
        _shared_client = client
    return client
def get_works_summary(orcid_id: str) -> dict:
    """Return the works summary for ``orcid_id`` via the shared ORCID client.

    Thin module-level wrapper around ``ORCIDClient.fetch_works``. It uses
    ``get_orcid_client()`` only; no extra ``ORCIDClient`` is constructed and
    thrown away.
    """
    return get_orcid_client().fetch_works(orcid_id)
def get_work_detail(orcid_id: str, put_code: int) -> dict | None:
    """Return the detail of one work via the shared ORCID client.

    Thin module-level wrapper around ``ORCIDClient.fetch_work_detail``. It
    uses ``get_orcid_client()`` only; no extra ``ORCIDClient`` is constructed
    and thrown away. The result is whatever ``fetch_work_detail`` returns.
    """
    return get_orcid_client().fetch_work_detail(orcid_id, put_code)
def get_record(orcid_id: str) -> dict:
    """Return the ORCID record for ``orcid_id`` via the shared ORCID client.

    Thin module-level wrapper around ``ORCIDClient.fetch_record``. It uses
    ``get_orcid_client()`` only; no extra ``ORCIDClient`` is constructed and
    thrown away.
    """
    return get_orcid_client().fetch_record(orcid_id)