feat(backend): rendimiento ORCID y autosync configurable
Reutiliza cliente HTTP ORCID, reduce consultas DB al sincronizar works y añade SYNC_* en settings y scheduler (cron mensual o intervalo). Actualiza backend/.env, frontend/.env y .env.example para despliegue.
This commit is contained in:
@@ -29,3 +29,23 @@ JWT_ISSUER=orcid-sword-backend
|
||||
JWT_AUDIENCE=orcid-sword-frontend
|
||||
|
||||
DOCS_ENABLED=false
|
||||
|
||||
# Rate limits (ajústalos por entorno)
|
||||
RATE_LIMIT_DEFAULT=60/minute
|
||||
RATE_LIMIT_AUTH=10/minute
|
||||
RATE_LIMIT_SEARCH_ANON=5/minute
|
||||
RATE_LIMIT_SEARCH_AUTH=30/minute
|
||||
RATE_LIMIT_EXPORT=20/minute
|
||||
RATE_LIMIT_SYNC=5/minute
|
||||
|
||||
MAX_ORCID_BATCH=25
|
||||
MAX_PUB_IDS_BATCH=500
|
||||
MAX_REQUEST_BODY_BYTES=1048576
|
||||
|
||||
# Autosincronización (APScheduler): mensual por defecto; interval_minutes para pruebas
|
||||
SYNC_SCHEDULER_ENABLED=true
|
||||
SYNC_SCHEDULE_MODE=monthly_cron
|
||||
SYNC_CRON_DAY=1
|
||||
SYNC_CRON_HOUR=3
|
||||
# SYNC_SCHEDULE_MODE=interval_minutes
|
||||
# SYNC_INTERVAL_MINUTES=2
|
||||
@@ -19,7 +19,7 @@ from app.schema.researcher import (
|
||||
)
|
||||
from app.security.jwt import get_optional_current_researcher
|
||||
from app.services.normalizer import PublicationNormalizer
|
||||
from app.services.orcid_client import get_display_name, get_work_detail, get_works_summary
|
||||
from app.services.orcid_client import get_display_name, get_orcid_client
|
||||
from app.utils.orcid_validator import ORCID_PATTERN, is_valid_orcid
|
||||
|
||||
|
||||
@@ -55,10 +55,15 @@ def _upsert_researcher_publications(
|
||||
orcid_id: str,
|
||||
db: Session,
|
||||
) -> List[Publication]:
|
||||
works = get_works_summary(orcid_id)
|
||||
orcid_client = get_orcid_client()
|
||||
works = orcid_client.fetch_works(orcid_id)
|
||||
groups = works.get("group", [])
|
||||
|
||||
publications: List[Publication] = []
|
||||
existing_by_put_code = {
|
||||
publication.put_code: publication
|
||||
for publication in db.query(Publication).filter(Publication.researcher_id == researcher.id).all()
|
||||
}
|
||||
|
||||
for g in groups:
|
||||
summaries = g.get("work-summary") or []
|
||||
@@ -71,20 +76,13 @@ def _upsert_researcher_publications(
|
||||
continue
|
||||
|
||||
try:
|
||||
detail = get_work_detail(orcid_id, put_code)
|
||||
detail = orcid_client.fetch_work_detail(orcid_id, put_code)
|
||||
except Exception:
|
||||
detail = None
|
||||
|
||||
data = PublicationNormalizer.normalize(summary, detail)
|
||||
|
||||
existing = (
|
||||
db.query(Publication)
|
||||
.filter(
|
||||
Publication.researcher_id == researcher.id,
|
||||
Publication.put_code == data["put_code"],
|
||||
)
|
||||
.first()
|
||||
)
|
||||
existing = existing_by_put_code.get(data["put_code"])
|
||||
|
||||
if existing:
|
||||
for field in [
|
||||
@@ -108,6 +106,7 @@ def _upsert_researcher_publications(
|
||||
pub.status = None
|
||||
db.add(pub)
|
||||
publications.append(pub)
|
||||
existing_by_put_code[data["put_code"]] = pub
|
||||
|
||||
researcher.last_sync_at = datetime.utcnow()
|
||||
db.commit()
|
||||
@@ -261,13 +260,18 @@ def sync_researcher(
|
||||
if not researcher:
|
||||
raise HTTPException(status_code=404, detail="Researcher not found")
|
||||
|
||||
works = get_works_summary(orcid_id)
|
||||
orcid_client = get_orcid_client()
|
||||
works = orcid_client.fetch_works(orcid_id)
|
||||
groups = works.get("group", [])
|
||||
|
||||
publications_output = []
|
||||
new_count = 0
|
||||
updated_count = 0
|
||||
unchanged_count = 0
|
||||
existing_by_put_code = {
|
||||
publication.put_code: publication
|
||||
for publication in db.query(Publication).filter(Publication.researcher_id == researcher.id).all()
|
||||
}
|
||||
|
||||
for g in groups:
|
||||
summaries = g.get("work-summary") or []
|
||||
@@ -280,20 +284,13 @@ def sync_researcher(
|
||||
continue
|
||||
|
||||
try:
|
||||
detail = get_work_detail(orcid_id, put_code)
|
||||
detail = orcid_client.fetch_work_detail(orcid_id, put_code)
|
||||
except Exception:
|
||||
detail = None
|
||||
|
||||
data = PublicationNormalizer.normalize(summary, detail)
|
||||
|
||||
existing = (
|
||||
db.query(Publication)
|
||||
.filter(
|
||||
Publication.researcher_id == researcher.id,
|
||||
Publication.put_code == data["put_code"],
|
||||
)
|
||||
.first()
|
||||
)
|
||||
existing = existing_by_put_code.get(data["put_code"])
|
||||
|
||||
if existing:
|
||||
if publication_changed(existing, data):
|
||||
@@ -316,8 +313,8 @@ def sync_researcher(
|
||||
pub.status = "new"
|
||||
db.add(pub)
|
||||
new_count += 1
|
||||
existing_by_put_code[data["put_code"]] = pub
|
||||
|
||||
db.flush()
|
||||
publications_output.append(pub)
|
||||
|
||||
researcher.last_sync_at = datetime.utcnow()
|
||||
|
||||
@@ -94,6 +94,12 @@ class Settings(BaseSettings):
|
||||
MAX_PUB_IDS_BATCH: int = 500
|
||||
MAX_REQUEST_BODY_BYTES: int = 1_048_576 # 1 MiB
|
||||
|
||||
SYNC_SCHEDULER_ENABLED: bool = True
|
||||
SYNC_SCHEDULE_MODE: Literal["monthly_cron", "interval_minutes"] = "monthly_cron"
|
||||
SYNC_CRON_DAY: int = 1
|
||||
SYNC_CRON_HOUR: int = 3
|
||||
SYNC_INTERVAL_MINUTES: int = 60
|
||||
|
||||
DOCS_ENABLED: bool = True
|
||||
|
||||
SECURITY_HSTS_SECONDS: int = 31_536_000
|
||||
|
||||
@@ -4,6 +4,9 @@ from app.db.session import SessionLocal
|
||||
from app.db.repositories.researcher_repository import ResearcherRepository
|
||||
from dotenv import load_dotenv
|
||||
import os
|
||||
import logging
|
||||
|
||||
from app.core.config import settings
|
||||
|
||||
|
||||
# Cargar variables del .env
|
||||
@@ -15,6 +18,7 @@ load_dotenv()
|
||||
|
||||
API_KEY = os.getenv("API_KEY_VALUE")
|
||||
BASE_URL = os.getenv("BASE_URL")
|
||||
logger = logging.getLogger("app.scheduler.sync")
|
||||
|
||||
# ---------------------------------------------------------
|
||||
# Función auxiliar: ejecutar sincronización mensual
|
||||
@@ -48,6 +52,37 @@ def run_monthly_sync():
|
||||
# ---------------------------------------------------------
|
||||
|
||||
def start_scheduler():
    """Start the background autosync job according to the SYNC_* settings.

    Does nothing (beyond logging) when ``settings.SYNC_SCHEDULER_ENABLED`` is
    false. Otherwise exactly one job, ``run_monthly_sync``, is registered
    under the id ``"researchers-autosync"``:

    * ``SYNC_SCHEDULE_MODE == "interval_minutes"``: an interval trigger that
      fires every ``SYNC_INTERVAL_MINUTES`` minutes.
    * any other mode (``"monthly_cron"``): a cron trigger on
      ``SYNC_CRON_DAY`` at ``SYNC_CRON_HOUR``.

    Returns:
        None. The scheduler object is not exposed to the caller.
    """
    if not settings.SYNC_SCHEDULER_ENABLED:
        logger.info("Autosync scheduler disabled by SYNC_SCHEDULER_ENABLED=false")
        return

    scheduler = BackgroundScheduler()

    # Only the configurable job is registered. The former hard-coded
    # "day=1, hour=3" cron job must not be added as well, otherwise the sync
    # would also fire on that fixed schedule regardless of SYNC_SCHEDULE_MODE.
    if settings.SYNC_SCHEDULE_MODE == "interval_minutes":
        scheduler.add_job(
            run_monthly_sync,
            "interval",
            minutes=settings.SYNC_INTERVAL_MINUTES,
            id="researchers-autosync",
            replace_existing=True,
        )
        logger.info(
            "Autosync scheduler started in interval mode: every %s minute(s)",
            settings.SYNC_INTERVAL_MINUTES,
        )
    else:
        scheduler.add_job(
            run_monthly_sync,
            "cron",
            day=settings.SYNC_CRON_DAY,
            hour=settings.SYNC_CRON_HOUR,
            id="researchers-autosync",
            replace_existing=True,
        )
        logger.info(
            "Autosync scheduler started in monthly mode: day=%s hour=%s",
            settings.SYNC_CRON_DAY,
            settings.SYNC_CRON_HOUR,
        )

    scheduler.start()
|
||||
|
||||
@@ -37,6 +37,7 @@ class ORCIDClient:
|
||||
self.client_id = settings.ORCID_CLIENT_ID
|
||||
self.client_secret = settings.ORCID_CLIENT_SECRET
|
||||
self._token_cache: Optional[str] = None
|
||||
self._http = httpx.Client(timeout=20.0)
|
||||
self.token_url = endpoints["token_url"]
|
||||
self.authorization_url = endpoints["authorization_url"]
|
||||
self.base_url = endpoints["api_base_url"]
|
||||
@@ -55,12 +56,11 @@ class ORCIDClient:
|
||||
"scope": "/read-public",
|
||||
}
|
||||
|
||||
with httpx.Client(timeout=20.0) as client:
|
||||
response = client.post(self.token_url, data=data)
|
||||
response.raise_for_status()
|
||||
token = response.json()["access_token"]
|
||||
self._token_cache = token
|
||||
return token
|
||||
response = self._http.post(self.token_url, data=data)
|
||||
response.raise_for_status()
|
||||
token = response.json()["access_token"]
|
||||
self._token_cache = token
|
||||
return token
|
||||
|
||||
# ---------------------------------------------------------
|
||||
# Headers comunes
|
||||
@@ -77,31 +77,28 @@ class ORCIDClient:
|
||||
# ---------------------------------------------------------
|
||||
def fetch_record(self, orcid_id: str) -> dict:
    """Fetch the full ORCID record for ``orcid_id``.

    Issues a single GET to ``{base_url}/{orcid_id}/record`` through the
    instance's shared ``self._http`` client, using ``self._headers()``.

    Args:
        orcid_id: ORCID iD inserted verbatim into the request path.

    Returns:
        The decoded JSON body of the response.

    Raises:
        Whatever ``response.raise_for_status()`` raises for an error status
        (presumably ``httpx.HTTPStatusError`` -- confirm against httpx docs).
    """
    url = f"{self.base_url}/{orcid_id}/record"
    # One request only, via the long-lived client; no per-call httpx.Client.
    response = self._http.get(url, headers=self._headers())
    response.raise_for_status()
    return response.json()
|
||||
|
||||
# ---------------------------------------------------------
|
||||
# 3. Consultar /works (summary)
|
||||
# ---------------------------------------------------------
|
||||
def fetch_works(self, orcid_id: str) -> dict:
    """Fetch the works summary for ``orcid_id``.

    Issues a single GET to ``{base_url}/{orcid_id}/works`` through the
    instance's shared ``self._http`` client, using ``self._headers()``.

    Args:
        orcid_id: ORCID iD inserted verbatim into the request path.

    Returns:
        The decoded JSON body of the response.

    Raises:
        Whatever ``response.raise_for_status()`` raises for an error status
        (presumably ``httpx.HTTPStatusError`` -- confirm against httpx docs).
    """
    url = f"{self.base_url}/{orcid_id}/works"
    # One request only, via the long-lived client; no per-call httpx.Client.
    response = self._http.get(url, headers=self._headers())
    response.raise_for_status()
    return response.json()
|
||||
|
||||
# ---------------------------------------------------------
|
||||
# 4. Consultar /work/{put_code} (detalle)
|
||||
# ---------------------------------------------------------
|
||||
def fetch_work_detail(self, orcid_id: str, put_code: int) -> dict | None:
|
||||
url = f"{self.base_url}/{orcid_id}/work/{put_code}"
|
||||
with httpx.Client(timeout=20.0) as client:
|
||||
response = client.get(url, headers=self._headers())
|
||||
if response.status_code != 200:
|
||||
return None
|
||||
return response.json()
|
||||
response = self._http.get(url, headers=self._headers())
|
||||
if response.status_code != 200:
|
||||
return None
|
||||
return response.json()
|
||||
|
||||
# ---------------------------------------------------------
|
||||
# OAuth 3-legged (authorization code)
|
||||
@@ -147,27 +144,36 @@ class ORCIDClient:
|
||||
"code": code,
|
||||
"redirect_uri": redirect_uri,
|
||||
}
|
||||
with httpx.Client(timeout=20.0) as client:
|
||||
response = client.post(self.token_url, data=data, headers={"Accept": "application/json"})
|
||||
response.raise_for_status()
|
||||
return response.json()
|
||||
response = self._http.post(self.token_url, data=data, headers={"Accept": "application/json"})
|
||||
response.raise_for_status()
|
||||
return response.json()
|
||||
|
||||
|
||||
# -------------------------------------------------------------------
|
||||
# Funciones de módulo usadas en researchers.py
|
||||
# -------------------------------------------------------------------
|
||||
_shared_client: ORCIDClient | None = None
|
||||
|
||||
|
||||
def get_orcid_client() -> ORCIDClient:
    """Return the module-wide ``ORCIDClient``, creating it on first use.

    The instance is stored in the module global ``_shared_client`` and the
    same object is handed back on every later call.

    NOTE(review): creation is not guarded by a lock; confirm whether
    concurrent first calls are possible in this deployment.
    """
    global _shared_client

    cached = _shared_client
    if cached is not None:
        return cached

    # First call: build the client and remember it for later callers.
    cached = ORCIDClient()
    _shared_client = cached
    return cached
|
||||
|
||||
|
||||
def get_works_summary(orcid_id: str) -> dict:
    """Return the works summary for ``orcid_id`` using the shared client.

    Thin module-level wrapper around ``ORCIDClient.fetch_works``. It obtains
    the client from ``get_orcid_client()`` rather than constructing a new
    ``ORCIDClient`` per call.

    Args:
        orcid_id: ORCID iD passed through to ``fetch_works``.

    Returns:
        Whatever ``fetch_works`` returns (the decoded JSON body).
    """
    return get_orcid_client().fetch_works(orcid_id)
|
||||
|
||||
|
||||
def get_work_detail(orcid_id: str, put_code: int) -> dict | None:
    """Return the detail of one work using the shared client.

    Thin module-level wrapper around ``ORCIDClient.fetch_work_detail``. It
    obtains the client from ``get_orcid_client()`` rather than constructing
    a new ``ORCIDClient`` per call.

    Args:
        orcid_id: ORCID iD passed through to ``fetch_work_detail``.
        put_code: Put-code of the work to fetch.

    Returns:
        Whatever ``fetch_work_detail`` returns: the decoded JSON body, or
        ``None`` when the work could not be fetched.
    """
    return get_orcid_client().fetch_work_detail(orcid_id, put_code)
|
||||
|
||||
|
||||
def get_record(orcid_id: str) -> dict:
    """Return the full ORCID record for ``orcid_id`` using the shared client.

    Thin module-level wrapper around ``ORCIDClient.fetch_record``. It obtains
    the client from ``get_orcid_client()`` rather than constructing a new
    ``ORCIDClient`` per call.

    Args:
        orcid_id: ORCID iD passed through to ``fetch_record``.

    Returns:
        Whatever ``fetch_record`` returns (the decoded JSON body).
    """
    return get_orcid_client().fetch_record(orcid_id)
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user