feat(backend): detalle ORCID en export y sync sin borrar datos

Enriquece obras al exportar SWORD/ZIP, pide detalle en obras nuevas (con tope), preserva campos de detalle en re-sync y desenvuelve la respuesta work de ORCID.
2026-05-20 12:56:02 +02:00
parent 330f0dd62b
commit 9b596af494
5 changed files with 254 additions and 22 deletions
@@ -10,6 +10,8 @@ from app.core.rate_limit import limiter
 from app.db.models import Publication, PublicationDownload, Researcher
 from app.db.session import get_db
 from app.security.export_auth import require_export_access
 from app.services.orcid_client import get_display_name
 from app.services.publication_enrichment import enrich_publications_from_orcid
 from app.services.sword_generator import SWORDGenerator
 from app.services.zip_generator import ZIPGenerator
 from app.utils.orcid_validator import ORCID_PATTERN, is_valid_orcid
@@ -51,6 +53,22 @@ def _record_downloads(db: Session, current: Researcher, pubs: Iterable[Publicati
        db.commit()
 def _prepare_researcher_and_publications_for_export(
    db: Session,
    researcher: Researcher,
    pubs: List[Publication],
 ) -> None:
    """Fill in the researcher's name and the works' ORCID detail before building SWORD/ZIP.

    When the researcher has no name stored, the display name is looked up by
    ORCID iD and, if one is returned, saved and the row refreshed. The
    publications are then passed to the ORCID enrichment step.
    """
    # Only look the name up when nothing is stored yet.
    fetched_name = (
        get_display_name(researcher.orcid_id) if not researcher.name else None
    )
    if fetched_name:
        researcher.name = fetched_name
        db.commit()
        db.refresh(researcher)
    enrich_publications_from_orcid(db, researcher, pubs)
 def _validate_pub_ids(pub_ids: List[UUID]) -> List[UUID]:
    if len(pub_ids) > settings.MAX_PUB_IDS_BATCH:
        raise HTTPException(status_code=413, detail="Too many publication IDs")
@@ -98,6 +116,10 @@ async def export_multiple_sword(
        raise HTTPException(status_code=404, detail="No publications found")
    researcher = db.query(Researcher).filter_by(id=pubs[0].researcher_id).first()
    if not researcher:
        raise HTTPException(status_code=404, detail="Researcher not found")
    _prepare_researcher_and_publications_for_export(db, researcher, pubs)
    xml_bytes = SWORDGenerator.generate_feed_xml(researcher, pubs)
    if current:
@@ -129,6 +151,8 @@ async def export_researcher_sword(
    if not pubs:
        raise HTTPException(status_code=404, detail="No publications found for this researcher")
    _prepare_researcher_and_publications_for_export(db, researcher, pubs)
    xml_bytes = SWORDGenerator.generate_feed_xml(researcher, pubs)
    if current:
        _record_downloads(db, current, pubs)
@@ -156,6 +180,10 @@ async def export_multiple_zip(
        raise HTTPException(status_code=404, detail="No publications found")
    researcher = db.query(Researcher).filter_by(id=pubs[0].researcher_id).first()
    if not researcher:
        raise HTTPException(status_code=404, detail="Researcher not found")
    _prepare_researcher_and_publications_for_export(db, researcher, pubs)
    zip_bytes = ZIPGenerator.generate_zip(researcher, pubs)
    if current:
@@ -187,6 +215,8 @@ async def export_researcher_zip(
    if not pubs:
        raise HTTPException(status_code=404, detail="No publications found for this researcher")
    _prepare_researcher_and_publications_for_export(db, researcher, pubs)
    zip_bytes = ZIPGenerator.generate_zip(researcher, pubs)
    if current:
        _record_downloads(db, current, pubs)
@@ -20,13 +20,19 @@ from app.schema.researcher import (
 from app.security.jwt import get_optional_current_researcher
 from app.services.normalizer import PublicationNormalizer
 from app.services.orcid_client import get_display_name, get_orcid_client
 from app.services.publication_enrichment import DETAIL_ONLY_FIELDS, apply_publication_data
 from app.utils.orcid_validator import ORCID_PATTERN, is_valid_orcid
 router = APIRouter(prefix="/researchers", tags=["researchers"])
-def publication_changed(existing: Publication, data: dict) -> bool:
+def publication_changed(
    existing: Publication,
    data: dict,
    *,
    include_detail_fields: bool = True,
 ) -> bool:
    fields = [
        "title", "subtitle", "type", "journal",
        "pub_year", "pub_month", "pub_day",
@@ -35,6 +41,8 @@ def publication_changed(existing: Publication, data: dict) -> bool:
        "language_code", "country",
        "external_ids", "contributors",
    ]
    if not include_detail_fields:
        fields = [f for f in fields if f not in DETAIL_ONLY_FIELDS]
    return any(getattr(existing, f) != data[f] for f in fields)
@@ -71,6 +79,8 @@ def _upsert_researcher_publications(
    detail_cap = max(0, min(settings.ORCID_WORK_DETAIL_ENRICH_MAX, 200))
    detail_budget = detail_cap if enrich_work_details else 0
    detail_attempts = 0
    new_detail_cap = max(0, min(settings.ORCID_NEW_SYNC_DETAIL_MAX, 200))
    new_detail_attempts = 0
    for g in groups:
        summaries = g.get("work-summary") or []
@@ -82,28 +92,36 @@ def _upsert_researcher_publications(
        if put_code is None:
            continue
        existing = existing_by_put_code.get(put_code)
        detail = None
-        if detail_budget > 0 and detail_attempts < detail_budget:
+        fetch_detail = (
            existing is None
            and new_detail_cap > 0
            and new_detail_attempts < new_detail_cap
        )
        if (
            not fetch_detail
            and enrich_work_details
            and detail_budget > 0
            and detail_attempts < detail_budget
        ):
            fetch_detail = True
        if fetch_detail:
            try:
                detail = orcid_client.fetch_work_detail(orcid_id, put_code)
            except Exception:
                detail = None
-            detail_attempts += 1
+            if existing is None:
                new_detail_attempts += 1
            else:
                detail_attempts += 1
        data = PublicationNormalizer.normalize(summary, detail)
-
+        preserve_detail = detail is None
        existing = existing_by_put_code.get(data["put_code"])
        if existing:
-            for field in [
+            apply_publication_data(existing, data, preserve_detail_if_missing=preserve_detail)
                "title", "subtitle", "type", "journal",
                "pub_year", "pub_month", "pub_day",
                "doi", "url", "short_description",
                "citation_type", "citation_value",
                "language_code", "country",
                "external_ids", "contributors",
            ]:
                setattr(existing, field, data[field])
            existing.last_modified = datetime.utcnow()
            existing.status = None
            publications.append(existing)
@@ -313,6 +331,8 @@ def sync_researcher(
    detail_cap = max(0, min(settings.ORCID_WORK_DETAIL_ENRICH_MAX, 200))
    detail_budget = detail_cap if enrich_work_details else 0
    detail_attempts = 0
    new_detail_cap = max(0, min(settings.ORCID_NEW_SYNC_DETAIL_MAX, 200))
    new_detail_attempts = 0
    for g in groups:
        summaries = g.get("work-summary") or []
@@ -324,26 +344,50 @@ def sync_researcher(
        if put_code is None:
            continue
        existing = existing_by_put_code.get(put_code)
        detail = None
-        if detail_budget > 0 and detail_attempts < detail_budget:
+        fetch_detail = (
            existing is None
            and new_detail_cap > 0
            and new_detail_attempts < new_detail_cap
        )
        if (
            not fetch_detail
            and enrich_work_details
            and detail_budget > 0
            and detail_attempts < detail_budget
        ):
            fetch_detail = True
        if fetch_detail:
            try:
                detail = orcid_client.fetch_work_detail(orcid_id, put_code)
            except Exception:
                detail = None
-            detail_attempts += 1
+            if existing is None:
                new_detail_attempts += 1
            else:
                detail_attempts += 1
        data = PublicationNormalizer.normalize(summary, detail)
-
+        preserve_detail = detail is None
        existing = existing_by_put_code.get(data["put_code"])
        if existing:
-            if publication_changed(existing, data):
+            if publication_changed(
-                for field in data:
+                existing,
-                    setattr(existing, field, data[field])
+                data,
                include_detail_fields=not preserve_detail,
            ):
                apply_publication_data(
                    existing, data, preserve_detail_if_missing=preserve_detail
                )
                existing.last_modified = datetime.utcnow()
                existing.status = "updated"
                updated_count += 1
            else:
                apply_publication_data(
                    existing, data, preserve_detail_if_missing=preserve_detail
                )
                existing.status = "unchanged"
                unchanged_count += 1
@@ -104,6 +104,8 @@ class Settings(BaseSettings):
    # Por defecto solo se usa el resumen de GET /works. Si se pide enrich, como máximo
    # se harán tantas peticiones de detalle (el resto se normaliza solo con summary).
    ORCID_WORK_DETAIL_ENRICH_MAX: int = 50
    # Máximo de obras nuevas (por sync) a las que se pide /work/{put_code}; el resto en export.
    ORCID_NEW_SYNC_DETAIL_MAX: int = 100
    DOCS_ENABLED: bool = True
@@ -98,7 +98,11 @@ class ORCIDClient:
        response = self._http.get(url, headers=self._headers())
        if response.status_code != 200:
            return None
-        return response.json()
+        payload = response.json()
        # ORCID v3 devuelve el work anidado bajo la clave "work".
        if isinstance(payload, dict) and "work" in payload:
            return payload["work"]
        return payload
    # ---------------------------------------------------------
    # OAuth 3-legged (authorization code)
@@ -0,0 +1,152 @@
 """
 Enriquecimiento de publicaciones desde ORCID (/work/{put_code}).
 - Sync rápido: solo resumen de /works; no pisa campos de detalle ya guardados.
 - Publicaciones nuevas en sync: se pide detalle siempre (pocas por ciclo).
 - Exportación: se completa detalle solo de las obras que se van a descargar.
 """
 from __future__ import annotations
 import logging
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from typing import Iterable, List
 from sqlalchemy.orm import Session
 from app.db.models import Publication, Researcher
 from app.services.normalizer import PublicationNormalizer
 from app.services.orcid_client import get_orcid_client
 logger = logging.getLogger("app.services.publication_enrichment")
 # Fields that usually only come from GET /work/{put_code}.
 DETAIL_ONLY_FIELDS = (
    "subtitle",
    "citation_type", "citation_value",
    "language_code", "country",
    "external_ids", "contributors",
 )
 # Every publication attribute that a normalized ORCID record may overwrite.
 UPDATABLE_FIELDS = (
    "title", "subtitle", "type", "journal",
    "pub_year", "pub_month", "pub_day",
    "doi", "url", "short_description",
    "citation_type", "citation_value",
    "language_code", "country",
    "external_ids", "contributors",
 )
 def publication_lacks_detail(pub: Publication) -> bool:
    """Return True when no ORCID detail has been stored for ``pub``.

    A publication counts as lacking detail only when its contributors,
    citation value, external ids and subtitle are all ``None``.
    """
    probe_fields = ("contributors", "citation_value", "external_ids", "subtitle")
    return all(getattr(pub, name) is None for name in probe_fields)
 def apply_publication_data(
    pub: Publication,
    data: dict,
    *,
    preserve_detail_if_missing: bool,
 ) -> None:
    """Copy the updatable fields from ``data`` onto ``pub``.

    Args:
        pub: Publication object mutated in place.
        data: Normalized record; fields absent from it are read as ``None``.
        preserve_detail_if_missing: When true, a detail-only field whose
            incoming value is ``None`` is left untouched on ``pub`` instead of
            being overwritten with ``None``.
    """
    for name in UPDATABLE_FIELDS:
        incoming = data.get(name)
        # Keep what is already stored when the record carries no detail value.
        keep_stored = (
            preserve_detail_if_missing
            and incoming is None
            and name in DETAIL_ONLY_FIELDS
        )
        if not keep_stored:
            setattr(pub, name, incoming)
 def _summary_by_put_code(works_payload: dict) -> dict[int, dict]:
    out: dict[int, dict] = {}
    for group in works_payload.get("group", []) or []:
        summaries = group.get("work-summary") or []
        if not summaries:
            continue
        summary = summaries[0]
        put_code = summary.get("put-code")
        if put_code is not None:
            out[int(put_code)] = summary
    return out
 def _fetch_detail_safe(orcid_id: str, put_code: int) -> dict | None:
    """Fetch one work's detail from ORCID, returning ``None`` on any exception.

    The exception is logged with its traceback rather than propagated, so a
    single failing work does not abort the caller.
    """
    detail = None
    try:
        detail = get_orcid_client().fetch_work_detail(orcid_id, put_code)
    except Exception:
        logger.exception("ORCID work detail failed orcid=%s put_code=%s", orcid_id, put_code)
    return detail
 def enrich_publications_from_orcid(
    db: Session,
    researcher: Researcher,
    publications: Iterable[Publication],
    *,
    max_workers: int = 6,
 ) -> None:
    """
    Fill in, in the DB and in memory, the ORCID detail of works that lack it.

    Only the publications passed in ``publications`` are touched (e.g. the ones
    being exported). Enrichment is best-effort: if the works listing cannot be
    fetched or is unusable, the publications are left as they are.

    Args:
        db: Session used to commit the enriched rows and refresh them.
        researcher: Owner of the publications; its ``orcid_id`` is queried.
        publications: Candidates; only those for which
            ``publication_lacks_detail`` is true are processed.
        max_workers: Upper bound on concurrent detail requests.
    """
    to_enrich = [p for p in publications if publication_lacks_detail(p)]
    if not to_enrich:
        return
    orcid_id = researcher.orcid_id
    client = get_orcid_client()
    try:
        works = client.fetch_works(orcid_id)
    except Exception:
        logger.exception("Could not fetch ORCID works for enrichment orcid=%s", orcid_id)
        return
    if not isinstance(works, dict):
        # A non-dict payload (e.g. None) cannot be indexed; skip enrichment
        # rather than raising in the middle of an export.
        logger.warning("Unusable ORCID works payload for enrichment orcid=%s", orcid_id)
        return
    summaries = _summary_by_put_code(works)
    # dict.fromkeys de-duplicates while keeping order, so each work is
    # requested from ORCID at most once.
    put_codes = list(
        dict.fromkeys(p.put_code for p in to_enrich if p.put_code in summaries)
    )
    if not put_codes:
        return
    details: dict[int, dict | None] = {}
    workers = max(1, min(max_workers, len(put_codes)))
    with ThreadPoolExecutor(max_workers=workers) as pool:
        futures = {
            pool.submit(_fetch_detail_safe, orcid_id, pc): pc for pc in put_codes
        }
        for fut in as_completed(futures):
            details[futures[fut]] = fut.result()
    for pub in to_enrich:
        summary = summaries.get(pub.put_code)
        detail = details.get(pub.put_code)
        # Without both a summary and a fetched detail there is nothing to add.
        if not summary or not detail:
            continue
        data = PublicationNormalizer.normalize(summary, detail)
        apply_publication_data(pub, data, preserve_detail_if_missing=False)
    db.commit()
    for pub in to_enrich:
        db.refresh(pub)