feat(backend): rendimiento ORCID y autosync configurable
Reutiliza cliente HTTP ORCID, reduce consultas DB al sincronizar works y añade SYNC_* en settings y scheduler (cron mensual o intervalo). Actualiza backend/.env, frontend/.env y .env.example para despliegue.
This commit is contained in:
@@ -29,3 +29,23 @@ JWT_ISSUER=orcid-sword-backend
|
|||||||
JWT_AUDIENCE=orcid-sword-frontend
|
JWT_AUDIENCE=orcid-sword-frontend
|
||||||
|
|
||||||
DOCS_ENABLED=false
|
DOCS_ENABLED=false
|
||||||
|
|
||||||
|
# Rate limits (ajústalos por entorno)
|
||||||
|
RATE_LIMIT_DEFAULT=60/minute
|
||||||
|
RATE_LIMIT_AUTH=10/minute
|
||||||
|
RATE_LIMIT_SEARCH_ANON=5/minute
|
||||||
|
RATE_LIMIT_SEARCH_AUTH=30/minute
|
||||||
|
RATE_LIMIT_EXPORT=20/minute
|
||||||
|
RATE_LIMIT_SYNC=5/minute
|
||||||
|
|
||||||
|
MAX_ORCID_BATCH=25
|
||||||
|
MAX_PUB_IDS_BATCH=500
|
||||||
|
MAX_REQUEST_BODY_BYTES=1048576
|
||||||
|
|
||||||
|
# Autosincronización (APScheduler): mensual por defecto; interval_minutes para pruebas
|
||||||
|
SYNC_SCHEDULER_ENABLED=true
|
||||||
|
SYNC_SCHEDULE_MODE=monthly_cron
|
||||||
|
SYNC_CRON_DAY=1
|
||||||
|
SYNC_CRON_HOUR=3
|
||||||
|
# SYNC_SCHEDULE_MODE=interval_minutes
|
||||||
|
# SYNC_INTERVAL_MINUTES=2
|
||||||
@@ -19,7 +19,7 @@ from app.schema.researcher import (
|
|||||||
)
|
)
|
||||||
from app.security.jwt import get_optional_current_researcher
|
from app.security.jwt import get_optional_current_researcher
|
||||||
from app.services.normalizer import PublicationNormalizer
|
from app.services.normalizer import PublicationNormalizer
|
||||||
from app.services.orcid_client import get_display_name, get_work_detail, get_works_summary
|
from app.services.orcid_client import get_display_name, get_orcid_client
|
||||||
from app.utils.orcid_validator import ORCID_PATTERN, is_valid_orcid
|
from app.utils.orcid_validator import ORCID_PATTERN, is_valid_orcid
|
||||||
|
|
||||||
|
|
||||||
@@ -55,10 +55,15 @@ def _upsert_researcher_publications(
|
|||||||
orcid_id: str,
|
orcid_id: str,
|
||||||
db: Session,
|
db: Session,
|
||||||
) -> List[Publication]:
|
) -> List[Publication]:
|
||||||
works = get_works_summary(orcid_id)
|
orcid_client = get_orcid_client()
|
||||||
|
works = orcid_client.fetch_works(orcid_id)
|
||||||
groups = works.get("group", [])
|
groups = works.get("group", [])
|
||||||
|
|
||||||
publications: List[Publication] = []
|
publications: List[Publication] = []
|
||||||
|
existing_by_put_code = {
|
||||||
|
publication.put_code: publication
|
||||||
|
for publication in db.query(Publication).filter(Publication.researcher_id == researcher.id).all()
|
||||||
|
}
|
||||||
|
|
||||||
for g in groups:
|
for g in groups:
|
||||||
summaries = g.get("work-summary") or []
|
summaries = g.get("work-summary") or []
|
||||||
@@ -71,20 +76,13 @@ def _upsert_researcher_publications(
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
try:
|
try:
|
||||||
detail = get_work_detail(orcid_id, put_code)
|
detail = orcid_client.fetch_work_detail(orcid_id, put_code)
|
||||||
except Exception:
|
except Exception:
|
||||||
detail = None
|
detail = None
|
||||||
|
|
||||||
data = PublicationNormalizer.normalize(summary, detail)
|
data = PublicationNormalizer.normalize(summary, detail)
|
||||||
|
|
||||||
existing = (
|
existing = existing_by_put_code.get(data["put_code"])
|
||||||
db.query(Publication)
|
|
||||||
.filter(
|
|
||||||
Publication.researcher_id == researcher.id,
|
|
||||||
Publication.put_code == data["put_code"],
|
|
||||||
)
|
|
||||||
.first()
|
|
||||||
)
|
|
||||||
|
|
||||||
if existing:
|
if existing:
|
||||||
for field in [
|
for field in [
|
||||||
@@ -108,6 +106,7 @@ def _upsert_researcher_publications(
|
|||||||
pub.status = None
|
pub.status = None
|
||||||
db.add(pub)
|
db.add(pub)
|
||||||
publications.append(pub)
|
publications.append(pub)
|
||||||
|
existing_by_put_code[data["put_code"]] = pub
|
||||||
|
|
||||||
researcher.last_sync_at = datetime.utcnow()
|
researcher.last_sync_at = datetime.utcnow()
|
||||||
db.commit()
|
db.commit()
|
||||||
@@ -261,13 +260,18 @@ def sync_researcher(
|
|||||||
if not researcher:
|
if not researcher:
|
||||||
raise HTTPException(status_code=404, detail="Researcher not found")
|
raise HTTPException(status_code=404, detail="Researcher not found")
|
||||||
|
|
||||||
works = get_works_summary(orcid_id)
|
orcid_client = get_orcid_client()
|
||||||
|
works = orcid_client.fetch_works(orcid_id)
|
||||||
groups = works.get("group", [])
|
groups = works.get("group", [])
|
||||||
|
|
||||||
publications_output = []
|
publications_output = []
|
||||||
new_count = 0
|
new_count = 0
|
||||||
updated_count = 0
|
updated_count = 0
|
||||||
unchanged_count = 0
|
unchanged_count = 0
|
||||||
|
existing_by_put_code = {
|
||||||
|
publication.put_code: publication
|
||||||
|
for publication in db.query(Publication).filter(Publication.researcher_id == researcher.id).all()
|
||||||
|
}
|
||||||
|
|
||||||
for g in groups:
|
for g in groups:
|
||||||
summaries = g.get("work-summary") or []
|
summaries = g.get("work-summary") or []
|
||||||
@@ -280,20 +284,13 @@ def sync_researcher(
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
try:
|
try:
|
||||||
detail = get_work_detail(orcid_id, put_code)
|
detail = orcid_client.fetch_work_detail(orcid_id, put_code)
|
||||||
except Exception:
|
except Exception:
|
||||||
detail = None
|
detail = None
|
||||||
|
|
||||||
data = PublicationNormalizer.normalize(summary, detail)
|
data = PublicationNormalizer.normalize(summary, detail)
|
||||||
|
|
||||||
existing = (
|
existing = existing_by_put_code.get(data["put_code"])
|
||||||
db.query(Publication)
|
|
||||||
.filter(
|
|
||||||
Publication.researcher_id == researcher.id,
|
|
||||||
Publication.put_code == data["put_code"],
|
|
||||||
)
|
|
||||||
.first()
|
|
||||||
)
|
|
||||||
|
|
||||||
if existing:
|
if existing:
|
||||||
if publication_changed(existing, data):
|
if publication_changed(existing, data):
|
||||||
@@ -316,8 +313,8 @@ def sync_researcher(
|
|||||||
pub.status = "new"
|
pub.status = "new"
|
||||||
db.add(pub)
|
db.add(pub)
|
||||||
new_count += 1
|
new_count += 1
|
||||||
|
existing_by_put_code[data["put_code"]] = pub
|
||||||
|
|
||||||
db.flush()
|
|
||||||
publications_output.append(pub)
|
publications_output.append(pub)
|
||||||
|
|
||||||
researcher.last_sync_at = datetime.utcnow()
|
researcher.last_sync_at = datetime.utcnow()
|
||||||
|
|||||||
@@ -94,6 +94,12 @@ class Settings(BaseSettings):
|
|||||||
MAX_PUB_IDS_BATCH: int = 500
|
MAX_PUB_IDS_BATCH: int = 500
|
||||||
MAX_REQUEST_BODY_BYTES: int = 1_048_576 # 1 MiB
|
MAX_REQUEST_BODY_BYTES: int = 1_048_576 # 1 MiB
|
||||||
|
|
||||||
|
SYNC_SCHEDULER_ENABLED: bool = True
|
||||||
|
SYNC_SCHEDULE_MODE: Literal["monthly_cron", "interval_minutes"] = "monthly_cron"
|
||||||
|
SYNC_CRON_DAY: int = 1
|
||||||
|
SYNC_CRON_HOUR: int = 3
|
||||||
|
SYNC_INTERVAL_MINUTES: int = 60
|
||||||
|
|
||||||
DOCS_ENABLED: bool = True
|
DOCS_ENABLED: bool = True
|
||||||
|
|
||||||
SECURITY_HSTS_SECONDS: int = 31_536_000
|
SECURITY_HSTS_SECONDS: int = 31_536_000
|
||||||
|
|||||||
@@ -4,6 +4,9 @@ from app.db.session import SessionLocal
|
|||||||
from app.db.repositories.researcher_repository import ResearcherRepository
|
from app.db.repositories.researcher_repository import ResearcherRepository
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
import os
|
import os
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from app.core.config import settings
|
||||||
|
|
||||||
|
|
||||||
# Cargar variables del .env
|
# Cargar variables del .env
|
||||||
@@ -15,6 +18,7 @@ load_dotenv()
|
|||||||
|
|
||||||
API_KEY = os.getenv("API_KEY_VALUE")
|
API_KEY = os.getenv("API_KEY_VALUE")
|
||||||
BASE_URL = os.getenv("BASE_URL")
|
BASE_URL = os.getenv("BASE_URL")
|
||||||
|
logger = logging.getLogger("app.scheduler.sync")
|
||||||
|
|
||||||
# ---------------------------------------------------------
|
# ---------------------------------------------------------
|
||||||
# Función auxiliar: ejecutar sincronización mensual
|
# Función auxiliar: ejecutar sincronización mensual
|
||||||
@@ -48,6 +52,37 @@ def run_monthly_sync():
|
|||||||
# ---------------------------------------------------------
|
# ---------------------------------------------------------
|
||||||
|
|
||||||
def start_scheduler():
    """Register the researchers autosync job and start the scheduler.

    When ``settings.SYNC_SCHEDULER_ENABLED`` is false the function only logs
    that autosync is disabled and returns. Otherwise ``run_monthly_sync`` is
    registered under the fixed job id ``researchers-autosync`` with
    ``replace_existing=True``:

    * ``SYNC_SCHEDULE_MODE == "interval_minutes"``: an ``interval`` trigger
      firing every ``SYNC_INTERVAL_MINUTES`` minutes.
    * any other mode: a ``cron`` trigger on ``SYNC_CRON_DAY`` at
      ``SYNC_CRON_HOUR``.
    """
    if not settings.SYNC_SCHEDULER_ENABLED:
        logger.info("Autosync scheduler disabled by SYNC_SCHEDULER_ENABLED=false")
        return

    scheduler = BackgroundScheduler()

    # Pick the trigger once; everything else about the job is shared.
    interval_mode = settings.SYNC_SCHEDULE_MODE == "interval_minutes"
    if interval_mode:
        trigger = "interval"
        trigger_kwargs = {"minutes": settings.SYNC_INTERVAL_MINUTES}
    else:
        trigger = "cron"
        trigger_kwargs = {
            "day": settings.SYNC_CRON_DAY,
            "hour": settings.SYNC_CRON_HOUR,
        }

    scheduler.add_job(
        run_monthly_sync,
        trigger,
        id="researchers-autosync",
        replace_existing=True,
        **trigger_kwargs,
    )

    if interval_mode:
        logger.info(
            "Autosync scheduler started in interval mode: every %s minute(s)",
            settings.SYNC_INTERVAL_MINUTES,
        )
    else:
        logger.info(
            "Autosync scheduler started in monthly mode: day=%s hour=%s",
            settings.SYNC_CRON_DAY,
            settings.SYNC_CRON_HOUR,
        )

    scheduler.start()
|
||||||
|
|||||||
@@ -37,6 +37,7 @@ class ORCIDClient:
|
|||||||
self.client_id = settings.ORCID_CLIENT_ID
|
self.client_id = settings.ORCID_CLIENT_ID
|
||||||
self.client_secret = settings.ORCID_CLIENT_SECRET
|
self.client_secret = settings.ORCID_CLIENT_SECRET
|
||||||
self._token_cache: Optional[str] = None
|
self._token_cache: Optional[str] = None
|
||||||
|
self._http = httpx.Client(timeout=20.0)
|
||||||
self.token_url = endpoints["token_url"]
|
self.token_url = endpoints["token_url"]
|
||||||
self.authorization_url = endpoints["authorization_url"]
|
self.authorization_url = endpoints["authorization_url"]
|
||||||
self.base_url = endpoints["api_base_url"]
|
self.base_url = endpoints["api_base_url"]
|
||||||
@@ -55,8 +56,7 @@ class ORCIDClient:
|
|||||||
"scope": "/read-public",
|
"scope": "/read-public",
|
||||||
}
|
}
|
||||||
|
|
||||||
with httpx.Client(timeout=20.0) as client:
|
response = self._http.post(self.token_url, data=data)
|
||||||
response = client.post(self.token_url, data=data)
|
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
token = response.json()["access_token"]
|
token = response.json()["access_token"]
|
||||||
self._token_cache = token
|
self._token_cache = token
|
||||||
@@ -77,8 +77,7 @@ class ORCIDClient:
|
|||||||
# ---------------------------------------------------------
|
# ---------------------------------------------------------
|
||||||
def fetch_record(self, orcid_id: str) -> dict:
    """Return the parsed JSON of ``{base_url}/{orcid_id}/record``.

    The request goes through the client's shared HTTP session with the
    headers produced by ``self._headers()``. An error status is surfaced
    by ``raise_for_status()`` on the response.
    """
    record_url = f"{self.base_url}/{orcid_id}/record"
    reply = self._http.get(record_url, headers=self._headers())
    reply.raise_for_status()
    return reply.json()
|
||||||
|
|
||||||
@@ -87,8 +86,7 @@ class ORCIDClient:
|
|||||||
# ---------------------------------------------------------
|
# ---------------------------------------------------------
|
||||||
def fetch_works(self, orcid_id: str) -> dict:
    """Return the parsed JSON of ``{base_url}/{orcid_id}/works``.

    The request goes through the client's shared HTTP session with the
    headers produced by ``self._headers()``. An error status is surfaced
    by ``raise_for_status()`` on the response.
    """
    works_url = f"{self.base_url}/{orcid_id}/works"
    reply = self._http.get(works_url, headers=self._headers())
    reply.raise_for_status()
    return reply.json()
|
||||||
|
|
||||||
@@ -97,8 +95,7 @@ class ORCIDClient:
|
|||||||
# ---------------------------------------------------------
|
# ---------------------------------------------------------
|
||||||
def fetch_work_detail(self, orcid_id: str, put_code: int) -> dict | None:
|
def fetch_work_detail(self, orcid_id: str, put_code: int) -> dict | None:
|
||||||
url = f"{self.base_url}/{orcid_id}/work/{put_code}"
|
url = f"{self.base_url}/{orcid_id}/work/{put_code}"
|
||||||
with httpx.Client(timeout=20.0) as client:
|
response = self._http.get(url, headers=self._headers())
|
||||||
response = client.get(url, headers=self._headers())
|
|
||||||
if response.status_code != 200:
|
if response.status_code != 200:
|
||||||
return None
|
return None
|
||||||
return response.json()
|
return response.json()
|
||||||
@@ -147,8 +144,7 @@ class ORCIDClient:
|
|||||||
"code": code,
|
"code": code,
|
||||||
"redirect_uri": redirect_uri,
|
"redirect_uri": redirect_uri,
|
||||||
}
|
}
|
||||||
with httpx.Client(timeout=20.0) as client:
|
response = self._http.post(self.token_url, data=data, headers={"Accept": "application/json"})
|
||||||
response = client.post(self.token_url, data=data, headers={"Accept": "application/json"})
|
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
return response.json()
|
return response.json()
|
||||||
|
|
||||||
@@ -156,18 +152,28 @@ class ORCIDClient:
|
|||||||
# -------------------------------------------------------------------
|
# -------------------------------------------------------------------
|
||||||
# Funciones de módulo usadas en researchers.py
|
# Funciones de módulo usadas en researchers.py
|
||||||
# -------------------------------------------------------------------
|
# -------------------------------------------------------------------
|
||||||
|
import threading

# Process-wide ORCIDClient, created lazily by get_orcid_client().
_shared_client: "ORCIDClient | None" = None
# Serialises first-time creation so concurrent first callers cannot each
# build (and then orphan) their own client.
_shared_client_lock = threading.Lock()


def get_orcid_client() -> "ORCIDClient":
    """Return the shared ``ORCIDClient``, creating it on first use.

    Every call returns the same instance. Creation is guarded by a lock
    because callers may run on different threads (for example a background
    scheduler job alongside request handlers); without it two first callers
    could each construct a client and one would be discarded.

    Returns:
        The single module-level ``ORCIDClient`` instance.
    """
    global _shared_client
    client = _shared_client
    if client is None:
        with _shared_client_lock:
            # Re-check under the lock: another thread may have won the race.
            if _shared_client is None:
                _shared_client = ORCIDClient()
            client = _shared_client
    return client
|
||||||
|
|
||||||
|
|
||||||
def get_works_summary(orcid_id: str) -> dict:
    """Return the works listing for *orcid_id* via the shared ORCID client."""
    return get_orcid_client().fetch_works(orcid_id)
|
||||||
|
|
||||||
|
|
||||||
def get_work_detail(orcid_id: str, put_code: int) -> dict | None:
    """Return the detail of one work via the shared ORCID client.

    Delegates to ``ORCIDClient.fetch_work_detail`` and passes its result
    through unchanged.
    """
    return get_orcid_client().fetch_work_detail(orcid_id, put_code)
|
||||||
|
|
||||||
|
|
||||||
def get_record(orcid_id: str) -> dict:
    """Return the full record for *orcid_id* via the shared ORCID client."""
    return get_orcid_client().fetch_record(orcid_id)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user