fix(backend): sync sin N+1 a ORCID en perfiles grandes
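Por defecto solo se usa GET /works; la consulta de detalle GET /work/{put_code} es opcional y está limitada por ORCID_WORK_DETAIL_ENRICH_MAX (con un tope duro interno de 200). Se añade enrich_work_details como campo del cuerpo en la búsqueda y como parámetro de query en sync.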
This commit is contained in:
@@ -47,5 +47,5 @@ SYNC_SCHEDULER_ENABLED=true
|
||||
SYNC_SCHEDULE_MODE=monthly_cron
|
||||
SYNC_CRON_DAY=1
|
||||
SYNC_CRON_HOUR=3
|
||||
# SYNC_SCHEDULE_MODE=interval_minutes
|
||||
# SYNC_INTERVAL_MINUTES=2
|
||||
# Máximo de GET /work/{put_code} cuando enrich_work_details=true (tope duro interno 200)
|
||||
ORCID_WORK_DETAIL_ENRICH_MAX=50
|
||||
@@ -2,7 +2,7 @@ from datetime import datetime
|
||||
from typing import List
|
||||
|
||||
import httpx
|
||||
from fastapi import APIRouter, Depends, HTTPException, Path, Request
|
||||
from fastapi import APIRouter, Depends, HTTPException, Path, Query, Request
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.core.config import settings
|
||||
@@ -54,6 +54,8 @@ def _upsert_researcher_publications(
|
||||
researcher: Researcher,
|
||||
orcid_id: str,
|
||||
db: Session,
|
||||
*,
|
||||
enrich_work_details: bool = False,
|
||||
) -> List[Publication]:
|
||||
orcid_client = get_orcid_client()
|
||||
works = orcid_client.fetch_works(orcid_id)
|
||||
@@ -65,6 +67,11 @@ def _upsert_researcher_publications(
|
||||
for publication in db.query(Publication).filter(Publication.researcher_id == researcher.id).all()
|
||||
}
|
||||
|
||||
# Tope duro para evitar timeouts y abuso aunque el .env pida un valor enorme.
|
||||
detail_cap = max(0, min(settings.ORCID_WORK_DETAIL_ENRICH_MAX, 200))
|
||||
detail_budget = detail_cap if enrich_work_details else 0
|
||||
detail_attempts = 0
|
||||
|
||||
for g in groups:
|
||||
summaries = g.get("work-summary") or []
|
||||
if not summaries:
|
||||
@@ -75,10 +82,13 @@ def _upsert_researcher_publications(
|
||||
if put_code is None:
|
||||
continue
|
||||
|
||||
try:
|
||||
detail = orcid_client.fetch_work_detail(orcid_id, put_code)
|
||||
except Exception:
|
||||
detail = None
|
||||
detail = None
|
||||
if detail_budget > 0 and detail_attempts < detail_budget:
|
||||
try:
|
||||
detail = orcid_client.fetch_work_detail(orcid_id, put_code)
|
||||
except Exception:
|
||||
detail = None
|
||||
detail_attempts += 1
|
||||
|
||||
data = PublicationNormalizer.normalize(summary, detail)
|
||||
|
||||
@@ -143,7 +153,13 @@ def _decorate_downloaded_by_me(
|
||||
return out
|
||||
|
||||
|
||||
def build_search_response(orcid_id: str, db: Session, current: Researcher | None) -> ResearcherWithPublicationsSchema:
|
||||
def build_search_response(
|
||||
orcid_id: str,
|
||||
db: Session,
|
||||
current: Researcher | None,
|
||||
*,
|
||||
enrich_work_details: bool = False,
|
||||
) -> ResearcherWithPublicationsSchema:
|
||||
if not is_valid_orcid(orcid_id):
|
||||
raise HTTPException(status_code=400, detail="Invalid ORCID iD")
|
||||
|
||||
@@ -164,7 +180,12 @@ def build_search_response(orcid_id: str, db: Session, current: Researcher | None
|
||||
researcher.name = display_name
|
||||
db.flush()
|
||||
|
||||
publications = _upsert_researcher_publications(researcher, orcid_id, db)
|
||||
publications = _upsert_researcher_publications(
|
||||
researcher,
|
||||
orcid_id,
|
||||
db,
|
||||
enrich_work_details=enrich_work_details,
|
||||
)
|
||||
publications_out = _decorate_downloaded_by_me(db=db, current=current, publications=publications)
|
||||
stats = build_researcher_stats(publications_out)
|
||||
|
||||
@@ -203,7 +224,14 @@ def search_and_sync_researchers(
|
||||
|
||||
for orcid_id in unique_orcid_ids:
|
||||
try:
|
||||
results.append(build_search_response(orcid_id, db, current))
|
||||
results.append(
|
||||
build_search_response(
|
||||
orcid_id,
|
||||
db,
|
||||
current,
|
||||
enrich_work_details=payload.enrich_work_details,
|
||||
)
|
||||
)
|
||||
except HTTPException as exc:
|
||||
db.rollback()
|
||||
errors.append(
|
||||
@@ -250,6 +278,14 @@ def search_and_sync_researchers(
|
||||
def sync_researcher(
|
||||
request: Request,
|
||||
orcid_id: str = Path(min_length=19, max_length=19, pattern=ORCID_PATTERN),
|
||||
enrich_work_details: bool = Query(
|
||||
False,
|
||||
description=(
|
||||
"Si es true, consulta ORCID GET /work/{put_code} hasta min(ORCID_WORK_DETAIL_ENRICH_MAX, 200) "
|
||||
"veces por perfil; el resto usa solo el resumen de /works. Por defecto false (recomendado para "
|
||||
"perfiles con muchas publicaciones)."
|
||||
),
|
||||
),
|
||||
db: Session = Depends(get_db),
|
||||
current: Researcher | None = Depends(get_optional_current_researcher),
|
||||
):
|
||||
@@ -273,6 +309,11 @@ def sync_researcher(
|
||||
for publication in db.query(Publication).filter(Publication.researcher_id == researcher.id).all()
|
||||
}
|
||||
|
||||
# Tope duro para evitar timeouts y abuso aunque el .env pida un valor enorme.
|
||||
detail_cap = max(0, min(settings.ORCID_WORK_DETAIL_ENRICH_MAX, 200))
|
||||
detail_budget = detail_cap if enrich_work_details else 0
|
||||
detail_attempts = 0
|
||||
|
||||
for g in groups:
|
||||
summaries = g.get("work-summary") or []
|
||||
if not summaries:
|
||||
@@ -283,10 +324,13 @@ def sync_researcher(
|
||||
if put_code is None:
|
||||
continue
|
||||
|
||||
try:
|
||||
detail = orcid_client.fetch_work_detail(orcid_id, put_code)
|
||||
except Exception:
|
||||
detail = None
|
||||
detail = None
|
||||
if detail_budget > 0 and detail_attempts < detail_budget:
|
||||
try:
|
||||
detail = orcid_client.fetch_work_detail(orcid_id, put_code)
|
||||
except Exception:
|
||||
detail = None
|
||||
detail_attempts += 1
|
||||
|
||||
data = PublicationNormalizer.normalize(summary, detail)
|
||||
|
||||
|
||||
@@ -100,6 +100,11 @@ class Settings(BaseSettings):
|
||||
SYNC_CRON_HOUR: int = 3
|
||||
SYNC_INTERVAL_MINUTES: int = 60
|
||||
|
||||
# Por publicación, GET /work/{put_code} es muy costoso (timeouts con cientos de works).
|
||||
# Por defecto solo se usa el resumen de GET /works. Si se pide enrich, como máximo
|
||||
# se harán ORCID_WORK_DETAIL_ENRICH_MAX peticiones de detalle, con un tope duro interno de 200 (el resto se normaliza solo con summary).
|
||||
ORCID_WORK_DETAIL_ENRICH_MAX: int = 50
|
||||
|
||||
DOCS_ENABLED: bool = True
|
||||
|
||||
SECURITY_HSTS_SECONDS: int = 31_536_000
|
||||
|
||||
@@ -42,6 +42,9 @@ class ResearcherBatchSearchRequestSchema(BaseModel):
|
||||
min_length=1,
|
||||
max_length=settings.MAX_ORCID_BATCH,
|
||||
)
|
||||
# Si es true, se consulta /work/{put_code} hasta min(ORCID_WORK_DETAIL_ENRICH_MAX, 200) veces
|
||||
# por investigador (contribuidores, citación, etc.); el resto solo usa summary.
|
||||
enrich_work_details: bool = False
|
||||
|
||||
@field_validator("orcid_ids")
|
||||
@classmethod
|
||||
|
||||
Reference in New Issue
Block a user