From 9b596af4940bbbcf25dfd04e1afb2b6892b05a9f Mon Sep 17 00:00:00 2001 From: Mireya Cueto Garrido Date: Wed, 20 May 2026 12:56:02 +0200 Subject: [PATCH] feat(backend): detalle ORCID en export y sync sin borrar datos Enriquece obras al exportar SWORD/ZIP, pide detalle en obras nuevas (con tope), preserva campos de detalle en re-sync y desenvuelve la respuesta work de ORCID. --- backend/app/api/export.py | 30 ++++ backend/app/api/researchers.py | 86 +++++++--- backend/app/core/config.py | 2 + backend/app/services/orcid_client.py | 6 +- .../app/services/publication_enrichment.py | 152 ++++++++++++++++++ 5 files changed, 254 insertions(+), 22 deletions(-) create mode 100644 backend/app/services/publication_enrichment.py diff --git a/backend/app/api/export.py b/backend/app/api/export.py index 2e2c295..a280edf 100644 --- a/backend/app/api/export.py +++ b/backend/app/api/export.py @@ -10,6 +10,8 @@ from app.core.rate_limit import limiter from app.db.models import Publication, PublicationDownload, Researcher from app.db.session import get_db from app.security.export_auth import require_export_access +from app.services.orcid_client import get_display_name +from app.services.publication_enrichment import enrich_publications_from_orcid from app.services.sword_generator import SWORDGenerator from app.services.zip_generator import ZIPGenerator from app.utils.orcid_validator import ORCID_PATTERN, is_valid_orcid @@ -51,6 +53,22 @@ def _record_downloads(db: Session, current: Researcher, pubs: Iterable[Publicati db.commit() +def _prepare_researcher_and_publications_for_export( + db: Session, + researcher: Researcher, + pubs: List[Publication], +) -> None: + """Nombre del investigador y detalle ORCID de obras antes de generar SWORD/ZIP.""" + if not researcher.name: + display_name = get_display_name(researcher.orcid_id) + if display_name: + researcher.name = display_name + db.commit() + db.refresh(researcher) + + enrich_publications_from_orcid(db, researcher, pubs) + + def _validate_pub_ids(pub_ids: List[UUID]) -> List[UUID]: if len(pub_ids) > settings.MAX_PUB_IDS_BATCH: raise HTTPException(status_code=413, detail="Too many publication IDs") @@ -98,6 +116,10 @@ async def export_multiple_sword( raise HTTPException(status_code=404, detail="No publications found") researcher = db.query(Researcher).filter_by(id=pubs[0].researcher_id).first() + if not researcher: + raise HTTPException(status_code=404, detail="Researcher not found") + + _prepare_researcher_and_publications_for_export(db, researcher, pubs) xml_bytes = SWORDGenerator.generate_feed_xml(researcher, pubs) if current: @@ -129,6 +151,8 @@ async def export_researcher_sword( if not pubs: raise HTTPException(status_code=404, detail="No publications found for this researcher") + _prepare_researcher_and_publications_for_export(db, researcher, pubs) + xml_bytes = SWORDGenerator.generate_feed_xml(researcher, pubs) if current: _record_downloads(db, current, pubs) @@ -156,6 +180,10 @@ async def export_multiple_zip( raise HTTPException(status_code=404, detail="No publications found") researcher = db.query(Researcher).filter_by(id=pubs[0].researcher_id).first() + if not researcher: + raise HTTPException(status_code=404, detail="Researcher not found") + + _prepare_researcher_and_publications_for_export(db, researcher, pubs) zip_bytes = ZIPGenerator.generate_zip(researcher, pubs) if current: @@ -187,6 +215,8 @@ async def export_researcher_zip( if not pubs: raise HTTPException(status_code=404, detail="No publications found for this researcher") + _prepare_researcher_and_publications_for_export(db, researcher, pubs) + zip_bytes = ZIPGenerator.generate_zip(researcher, pubs) if current: _record_downloads(db, current, pubs) diff --git a/backend/app/api/researchers.py b/backend/app/api/researchers.py index 377ab4b..9ca3e81 100644 --- a/backend/app/api/researchers.py +++ b/backend/app/api/researchers.py @@ -20,13 +20,19 @@ from app.schema.researcher import ( from app.security.jwt import get_optional_current_researcher from app.services.normalizer import PublicationNormalizer from app.services.orcid_client import get_display_name, get_orcid_client +from app.services.publication_enrichment import DETAIL_ONLY_FIELDS, apply_publication_data from app.utils.orcid_validator import ORCID_PATTERN, is_valid_orcid router = APIRouter(prefix="/researchers", tags=["researchers"]) -def publication_changed(existing: Publication, data: dict) -> bool: +def publication_changed( + existing: Publication, + data: dict, + *, + include_detail_fields: bool = True, +) -> bool: fields = [ "title", "subtitle", "type", "journal", "pub_year", "pub_month", "pub_day", @@ -35,6 +41,8 @@ def publication_changed(existing: Publication, data: dict) -> bool: "language_code", "country", "external_ids", "contributors", ] + if not include_detail_fields: + fields = [f for f in fields if f not in DETAIL_ONLY_FIELDS] return any(getattr(existing, f) != data[f] for f in fields) @@ -71,6 +79,8 @@ def _upsert_researcher_publications( detail_cap = max(0, min(settings.ORCID_WORK_DETAIL_ENRICH_MAX, 200)) detail_budget = detail_cap if enrich_work_details else 0 detail_attempts = 0 + new_detail_cap = max(0, min(settings.ORCID_NEW_SYNC_DETAIL_MAX, 200)) + new_detail_attempts = 0 for g in groups: summaries = g.get("work-summary") or [] @@ -82,28 +92,36 @@ def _upsert_researcher_publications( if put_code is None: continue + existing = existing_by_put_code.get(put_code) detail = None - if detail_budget > 0 and detail_attempts < detail_budget: + fetch_detail = ( + existing is None + and new_detail_cap > 0 + and new_detail_attempts < new_detail_cap + ) + if ( + not fetch_detail + and enrich_work_details + and detail_budget > 0 + and detail_attempts < detail_budget + ): + fetch_detail = True + + if fetch_detail: try: detail = orcid_client.fetch_work_detail(orcid_id, put_code) except Exception: detail = None - detail_attempts += 1 + if existing is None: + new_detail_attempts += 1 + else: + detail_attempts += 1 data = PublicationNormalizer.normalize(summary, detail) - - existing = existing_by_put_code.get(data["put_code"]) + preserve_detail = detail is None if existing: - for field in [ - "title", "subtitle", "type", "journal", - "pub_year", "pub_month", "pub_day", - "doi", "url", "short_description", - "citation_type", "citation_value", - "language_code", "country", - "external_ids", "contributors", - ]: - setattr(existing, field, data[field]) + apply_publication_data(existing, data, preserve_detail_if_missing=preserve_detail) existing.last_modified = datetime.utcnow() existing.status = None publications.append(existing) @@ -313,6 +331,8 @@ def sync_researcher( detail_cap = max(0, min(settings.ORCID_WORK_DETAIL_ENRICH_MAX, 200)) detail_budget = detail_cap if enrich_work_details else 0 detail_attempts = 0 + new_detail_cap = max(0, min(settings.ORCID_NEW_SYNC_DETAIL_MAX, 200)) + new_detail_attempts = 0 for g in groups: summaries = g.get("work-summary") or [] @@ -324,26 +344,50 @@ def sync_researcher( if put_code is None: continue + existing = existing_by_put_code.get(put_code) detail = None - if detail_budget > 0 and detail_attempts < detail_budget: + fetch_detail = ( + existing is None + and new_detail_cap > 0 + and new_detail_attempts < new_detail_cap + ) + if ( + not fetch_detail + and enrich_work_details + and detail_budget > 0 + and detail_attempts < detail_budget + ): + fetch_detail = True + + if fetch_detail: try: detail = orcid_client.fetch_work_detail(orcid_id, put_code) except Exception: detail = None - detail_attempts += 1 + if existing is None: + new_detail_attempts += 1 + else: + detail_attempts += 1 data = PublicationNormalizer.normalize(summary, detail) - - existing = existing_by_put_code.get(data["put_code"]) + preserve_detail = detail is None if existing: - if publication_changed(existing, data): - for field in data: - setattr(existing, field, data[field]) + if publication_changed( + existing, + data, + include_detail_fields=not preserve_detail, + ): + apply_publication_data( + existing, data, preserve_detail_if_missing=preserve_detail + ) existing.last_modified = datetime.utcnow() existing.status = "updated" updated_count += 1 else: + apply_publication_data( + existing, data, preserve_detail_if_missing=preserve_detail + ) existing.status = "unchanged" unchanged_count += 1 diff --git a/backend/app/core/config.py b/backend/app/core/config.py index 9d02ef8..814f900 100644 --- a/backend/app/core/config.py +++ b/backend/app/core/config.py @@ -104,6 +104,8 @@ class Settings(BaseSettings): # Por defecto solo se usa el resumen de GET /works. Si se pide enrich, como máximo # se harán tantas peticiones de detalle (el resto se normaliza solo con summary). ORCID_WORK_DETAIL_ENRICH_MAX: int = 50 + # Máximo de obras nuevas (por sync) a las que se pide /work/{put_code}; el resto en export. + ORCID_NEW_SYNC_DETAIL_MAX: int = 100 DOCS_ENABLED: bool = True diff --git a/backend/app/services/orcid_client.py b/backend/app/services/orcid_client.py index daeb6b5..1a173fd 100644 --- a/backend/app/services/orcid_client.py +++ b/backend/app/services/orcid_client.py @@ -98,7 +98,11 @@ class ORCIDClient: response = self._http.get(url, headers=self._headers()) if response.status_code != 200: return None - return response.json() + payload = response.json() + # ORCID v3 devuelve el work anidado bajo la clave "work". + if isinstance(payload, dict) and "work" in payload: + return payload["work"] + return payload # --------------------------------------------------------- # OAuth 3-legged (authorization code) diff --git a/backend/app/services/publication_enrichment.py b/backend/app/services/publication_enrichment.py new file mode 100644 index 0000000..7f2d55a --- /dev/null +++ b/backend/app/services/publication_enrichment.py @@ -0,0 +1,152 @@ +""" +Enriquecimiento de publicaciones desde ORCID (/work/{put_code}). + +- Sync rápido: solo resumen de /works; no pisa campos de detalle ya guardados. +- Publicaciones nuevas en sync: se pide detalle siempre (pocas por ciclo). +- Exportación: se completa detalle solo de las obras que se van a descargar. +""" + +from __future__ import annotations + +import logging +from concurrent.futures import ThreadPoolExecutor, as_completed +from typing import Iterable, List + +from sqlalchemy.orm import Session + +from app.db.models import Publication, Researcher +from app.services.normalizer import PublicationNormalizer +from app.services.orcid_client import get_orcid_client + +logger = logging.getLogger("app.services.publication_enrichment") + +# Campos que solo suelen venir del GET /work/{put_code} +DETAIL_ONLY_FIELDS = ( + "subtitle", + "citation_type", + "citation_value", + "language_code", + "country", + "external_ids", + "contributors", +) + +UPDATABLE_FIELDS = ( + "title", + "subtitle", + "type", + "journal", + "pub_year", + "pub_month", + "pub_day", + "doi", + "url", + "short_description", + "citation_type", + "citation_value", + "language_code", + "country", + "external_ids", + "contributors", +) + + +def publication_lacks_detail(pub: Publication) -> bool: + """True si nunca se guardó detalle ORCID (contribuidores/cita/ids extendidos).""" + return ( + pub.contributors is None + and pub.citation_value is None + and pub.external_ids is None + and pub.subtitle is None + ) + + +def apply_publication_data( + pub: Publication, + data: dict, + *, + preserve_detail_if_missing: bool, +) -> None: + for field in UPDATABLE_FIELDS: + value = data.get(field) + if ( + preserve_detail_if_missing + and field in DETAIL_ONLY_FIELDS + and value is None + ): + continue + setattr(pub, field, value) + + +def _summary_by_put_code(works_payload: dict) -> dict[int, dict]: + out: dict[int, dict] = {} + for group in works_payload.get("group", []) or []: + summaries = group.get("work-summary") or [] + if not summaries: + continue + summary = summaries[0] + put_code = summary.get("put-code") + if put_code is not None: + out[int(put_code)] = summary + return out + + +def _fetch_detail_safe(orcid_id: str, put_code: int) -> dict | None: + try: + return get_orcid_client().fetch_work_detail(orcid_id, put_code) + except Exception: + logger.exception("ORCID work detail failed orcid=%s put_code=%s", orcid_id, put_code) + return None + + +def enrich_publications_from_orcid( + db: Session, + researcher: Researcher, + publications: Iterable[Publication], + *, + max_workers: int = 6, +) -> None: + """ + Completa en BD (y en memoria) el detalle ORCID de obras que aún no lo tienen. + Solo actúa sobre publicaciones pasadas en `publications` (p. ej. las que se exportan). + """ + to_enrich = [p for p in publications if publication_lacks_detail(p)] + if not to_enrich: + return + + orcid_id = researcher.orcid_id + client = get_orcid_client() + try: + works = client.fetch_works(orcid_id) + except Exception: + logger.exception("Could not fetch ORCID works for enrichment orcid=%s", orcid_id) + return + + summaries = _summary_by_put_code(works) + put_codes = [p.put_code for p in to_enrich if p.put_code in summaries] + if not put_codes: + return + + details: dict[int, dict | None] = {} + workers = max(1, min(max_workers, len(put_codes))) + with ThreadPoolExecutor(max_workers=workers) as pool: + futures = { + pool.submit(_fetch_detail_safe, orcid_id, pc): pc for pc in put_codes + } + for fut in as_completed(futures): + pc = futures[fut] + details[pc] = fut.result() + + for pub in to_enrich: + summary = summaries.get(pub.put_code) + if not summary: + continue + detail = details.get(pub.put_code) + if not detail: + continue + data = PublicationNormalizer.normalize(summary, detail) + apply_publication_data(pub, data, preserve_detail_if_missing=False) + + db.commit() + for pub in to_enrich: + db.refresh(pub)