9b596af494
Enriquece obras al exportar SWORD/ZIP, pide detalle en obras nuevas (con tope), preserva campos de detalle en re-sync y desenvuelve la respuesta work de ORCID.
153 lines
4.2 KiB
Python
153 lines
4.2 KiB
Python
"""
|
|
Publication enrichment from ORCID (/work/{put_code}).

- Fast sync: only the /works summary is used; detail fields already stored are not overwritten.
- New publications during sync: detail is always requested (few per cycle).
- Export: detail is completed only for the works that are going to be downloaded.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
from typing import Iterable, List
|
|
|
|
from sqlalchemy.orm import Session
|
|
|
|
from app.db.models import Publication, Researcher
|
|
from app.services.normalizer import PublicationNormalizer
|
|
from app.services.orcid_client import get_orcid_client
|
|
|
|
# Module logger; the name is spelled out explicitly instead of using __name__.
logger = logging.getLogger("app.services.publication_enrichment")

# Fields that usually only come from GET /work/{put_code}.
DETAIL_ONLY_FIELDS = (
    "subtitle",
    "citation_type",
    "citation_value",
    "language_code",
    "country",
    "external_ids",
    "contributors",
)
|
|
|
|
# Every attribute name that apply_publication_data() may write on a
# publication, in the order the attributes are assigned.
UPDATABLE_FIELDS = (
    # Titles and classification.
    "title",
    "subtitle",
    "type",
    "journal",
    # Publication date parts.
    "pub_year",
    "pub_month",
    "pub_day",
    # Identifiers and links.
    "doi",
    "url",
    "short_description",
    # Citation.
    "citation_type",
    "citation_value",
    # Locale.
    "language_code",
    "country",
    # Structured extras.
    "external_ids",
    "contributors",
)
|
|
|
|
|
|
def publication_lacks_detail(pub: Publication) -> bool:
    """Return True if no ORCID detail (contributors/citation/extended ids) was ever stored.

    The publication counts as lacking detail only when ``contributors``,
    ``citation_value``, ``external_ids`` and ``subtitle`` are all ``None``.
    Any other value, including an empty list or empty string, counts as
    stored detail.
    """
    probed = ("contributors", "citation_value", "external_ids", "subtitle")
    # all() stops at the first attribute that is not None, in the order above.
    return all(getattr(pub, attr) is None for attr in probed)
|
|
|
|
|
|
def apply_publication_data(
    pub: Publication,
    data: dict,
    *,
    preserve_detail_if_missing: bool,
) -> None:
    """Copy values from ``data`` onto ``pub`` for every name in UPDATABLE_FIELDS.

    Args:
        pub: Object whose attributes are overwritten in place.
        data: Mapping of field name to new value; a missing key is read as
            ``None``.
        preserve_detail_if_missing: When true, a field listed in
            DETAIL_ONLY_FIELDS whose incoming value is ``None`` is left as it
            is on ``pub`` instead of being reset to ``None``.
    """
    for name in UPDATABLE_FIELDS:
        incoming = data.get(name)
        keep_existing = (
            preserve_detail_if_missing
            and incoming is None
            and name in DETAIL_ONLY_FIELDS
        )
        if not keep_existing:
            setattr(pub, name, incoming)
|
|
|
|
|
|
def _summary_by_put_code(works_payload: dict) -> dict[int, dict]:
    """Index the first work summary of each group by its integer put-code.

    Reads ``works_payload["group"]``; a missing or falsy value is treated as
    no groups. For each group only the first entry of ``"work-summary"`` is
    considered. Groups without summaries, or whose first summary has no
    ``"put-code"``, are skipped. If two groups yield the same put-code, the
    later one wins.
    """
    index: dict[int, dict] = {}
    groups = works_payload.get("group", []) or []
    for entry in groups:
        candidates = entry.get("work-summary") or []
        if candidates:
            first = candidates[0]
            code = first.get("put-code")
            if code is not None:
                # Keys are normalised with int(), so "12" and 12 collide.
                index[int(code)] = first
    return index
|
|
|
|
|
|
def _fetch_detail_safe(orcid_id: str, put_code: int) -> dict | None:
    """Fetch one work's detail from the ORCID client without ever raising ``Exception``.

    Returns whatever ``fetch_work_detail`` returns. On any ``Exception`` the
    failure is logged with its traceback and ``None`` is returned.
    """
    try:
        detail = get_orcid_client().fetch_work_detail(orcid_id, put_code)
    except Exception:
        # Best effort by design: one failed work must not abort the batch.
        logger.exception("ORCID work detail failed orcid=%s put_code=%s", orcid_id, put_code)
        detail = None
    return detail
|
|
|
|
|
|
def enrich_publications_from_orcid(
    db: Session,
    researcher: Researcher,
    publications: Iterable[Publication],
    *,
    max_workers: int = 6,
) -> None:
    """Fill in (in the DB and in memory) the ORCID detail of works that lack it.

    Only the publications passed in ``publications`` are considered (e.g. the
    ones about to be exported), and of those only the ones for which
    ``publication_lacks_detail`` is true.

    Args:
        db: Session used to commit the changes and refresh the objects.
        researcher: Its ``orcid_id`` selects the ORCID record to query.
        publications: Candidate publications; iterated exactly once.
        max_workers: Upper bound on concurrent detail requests. The pool is
            sized between 1 and the number of distinct put-codes to fetch.

    Returns:
        None. Publications are mutated in place. If the works summary cannot
        be fetched, nothing matches, or no detail could be obtained, the
        function returns without committing.
    """
    to_enrich = [p for p in publications if publication_lacks_detail(p)]
    if not to_enrich:
        return

    orcid_id = researcher.orcid_id
    client = get_orcid_client()
    try:
        works = client.fetch_works(orcid_id)
    except Exception:
        logger.exception("Could not fetch ORCID works for enrichment orcid=%s", orcid_id)
        return

    summaries = _summary_by_put_code(works)
    # dict.fromkeys de-duplicates while keeping first-seen order, so a
    # put_code that appears more than once is requested from ORCID only once.
    put_codes = list(
        dict.fromkeys(p.put_code for p in to_enrich if p.put_code in summaries)
    )
    if not put_codes:
        return

    details: dict[int, dict | None] = {}
    workers = max(1, min(max_workers, len(put_codes)))
    with ThreadPoolExecutor(max_workers=workers) as pool:
        futures = {
            pool.submit(_fetch_detail_safe, orcid_id, pc): pc for pc in put_codes
        }
        for fut in as_completed(futures):
            # _fetch_detail_safe swallows Exception, so a failed request
            # shows up here as None rather than as a raised error.
            details[futures[fut]] = fut.result()

    changed = False
    for pub in to_enrich:
        summary = summaries.get(pub.put_code)
        detail = details.get(pub.put_code)
        if not summary or not detail:
            continue
        data = PublicationNormalizer.normalize(summary, detail)
        # Detail was fetched, so missing detail fields are written as None.
        # NOTE(review): a work whose detail carries none of the probed fields
        # would presumably still satisfy publication_lacks_detail() afterwards
        # and be fetched again on the next call - confirm whether that matters.
        apply_publication_data(pub, data, preserve_detail_if_missing=False)
        changed = True

    if not changed:
        # Nothing was written: skip the commit and the per-row refresh queries.
        return

    db.commit()
    for pub in to_enrich:
        db.refresh(pub)
|