feat(backend): detalle ORCID en export y sync sin borrar datos
Enriquece obras al exportar SWORD/ZIP, pide detalle en obras nuevas (con tope), preserva campos de detalle en re-sync y desenvuelve la respuesta work de ORCID.
This commit is contained in:
@@ -10,6 +10,8 @@ from app.core.rate_limit import limiter
|
|||||||
from app.db.models import Publication, PublicationDownload, Researcher
|
from app.db.models import Publication, PublicationDownload, Researcher
|
||||||
from app.db.session import get_db
|
from app.db.session import get_db
|
||||||
from app.security.export_auth import require_export_access
|
from app.security.export_auth import require_export_access
|
||||||
|
from app.services.orcid_client import get_display_name
|
||||||
|
from app.services.publication_enrichment import enrich_publications_from_orcid
|
||||||
from app.services.sword_generator import SWORDGenerator
|
from app.services.sword_generator import SWORDGenerator
|
||||||
from app.services.zip_generator import ZIPGenerator
|
from app.services.zip_generator import ZIPGenerator
|
||||||
from app.utils.orcid_validator import ORCID_PATTERN, is_valid_orcid
|
from app.utils.orcid_validator import ORCID_PATTERN, is_valid_orcid
|
||||||
@@ -51,6 +53,22 @@ def _record_downloads(db: Session, current: Researcher, pubs: Iterable[Publicati
|
|||||||
db.commit()
|
db.commit()
|
||||||
|
|
||||||
|
|
||||||
|
def _prepare_researcher_and_publications_for_export(
    db: Session,
    researcher: Researcher,
    pubs: List[Publication],
) -> None:
    """Fill in the researcher name and ORCID work detail before building SWORD/ZIP.

    When the researcher has no stored name, the display name is looked up by
    ORCID iD and, if one comes back, persisted and the row refreshed. The
    publications are then handed to ``enrich_publications_from_orcid``.
    """
    # Only look the name up when nothing is stored yet.
    fetched_name = None if researcher.name else get_display_name(researcher.orcid_id)
    if fetched_name:
        researcher.name = fetched_name
        db.commit()
        db.refresh(researcher)

    enrich_publications_from_orcid(db, researcher, pubs)
|
||||||
|
|
||||||
|
|
||||||
def _validate_pub_ids(pub_ids: List[UUID]) -> List[UUID]:
|
def _validate_pub_ids(pub_ids: List[UUID]) -> List[UUID]:
|
||||||
if len(pub_ids) > settings.MAX_PUB_IDS_BATCH:
|
if len(pub_ids) > settings.MAX_PUB_IDS_BATCH:
|
||||||
raise HTTPException(status_code=413, detail="Too many publication IDs")
|
raise HTTPException(status_code=413, detail="Too many publication IDs")
|
||||||
@@ -98,6 +116,10 @@ async def export_multiple_sword(
|
|||||||
raise HTTPException(status_code=404, detail="No publications found")
|
raise HTTPException(status_code=404, detail="No publications found")
|
||||||
|
|
||||||
researcher = db.query(Researcher).filter_by(id=pubs[0].researcher_id).first()
|
researcher = db.query(Researcher).filter_by(id=pubs[0].researcher_id).first()
|
||||||
|
if not researcher:
|
||||||
|
raise HTTPException(status_code=404, detail="Researcher not found")
|
||||||
|
|
||||||
|
_prepare_researcher_and_publications_for_export(db, researcher, pubs)
|
||||||
|
|
||||||
xml_bytes = SWORDGenerator.generate_feed_xml(researcher, pubs)
|
xml_bytes = SWORDGenerator.generate_feed_xml(researcher, pubs)
|
||||||
if current:
|
if current:
|
||||||
@@ -129,6 +151,8 @@ async def export_researcher_sword(
|
|||||||
if not pubs:
|
if not pubs:
|
||||||
raise HTTPException(status_code=404, detail="No publications found for this researcher")
|
raise HTTPException(status_code=404, detail="No publications found for this researcher")
|
||||||
|
|
||||||
|
_prepare_researcher_and_publications_for_export(db, researcher, pubs)
|
||||||
|
|
||||||
xml_bytes = SWORDGenerator.generate_feed_xml(researcher, pubs)
|
xml_bytes = SWORDGenerator.generate_feed_xml(researcher, pubs)
|
||||||
if current:
|
if current:
|
||||||
_record_downloads(db, current, pubs)
|
_record_downloads(db, current, pubs)
|
||||||
@@ -156,6 +180,10 @@ async def export_multiple_zip(
|
|||||||
raise HTTPException(status_code=404, detail="No publications found")
|
raise HTTPException(status_code=404, detail="No publications found")
|
||||||
|
|
||||||
researcher = db.query(Researcher).filter_by(id=pubs[0].researcher_id).first()
|
researcher = db.query(Researcher).filter_by(id=pubs[0].researcher_id).first()
|
||||||
|
if not researcher:
|
||||||
|
raise HTTPException(status_code=404, detail="Researcher not found")
|
||||||
|
|
||||||
|
_prepare_researcher_and_publications_for_export(db, researcher, pubs)
|
||||||
|
|
||||||
zip_bytes = ZIPGenerator.generate_zip(researcher, pubs)
|
zip_bytes = ZIPGenerator.generate_zip(researcher, pubs)
|
||||||
if current:
|
if current:
|
||||||
@@ -187,6 +215,8 @@ async def export_researcher_zip(
|
|||||||
if not pubs:
|
if not pubs:
|
||||||
raise HTTPException(status_code=404, detail="No publications found for this researcher")
|
raise HTTPException(status_code=404, detail="No publications found for this researcher")
|
||||||
|
|
||||||
|
_prepare_researcher_and_publications_for_export(db, researcher, pubs)
|
||||||
|
|
||||||
zip_bytes = ZIPGenerator.generate_zip(researcher, pubs)
|
zip_bytes = ZIPGenerator.generate_zip(researcher, pubs)
|
||||||
if current:
|
if current:
|
||||||
_record_downloads(db, current, pubs)
|
_record_downloads(db, current, pubs)
|
||||||
|
|||||||
@@ -20,13 +20,19 @@ from app.schema.researcher import (
|
|||||||
from app.security.jwt import get_optional_current_researcher
|
from app.security.jwt import get_optional_current_researcher
|
||||||
from app.services.normalizer import PublicationNormalizer
|
from app.services.normalizer import PublicationNormalizer
|
||||||
from app.services.orcid_client import get_display_name, get_orcid_client
|
from app.services.orcid_client import get_display_name, get_orcid_client
|
||||||
|
from app.services.publication_enrichment import DETAIL_ONLY_FIELDS, apply_publication_data
|
||||||
from app.utils.orcid_validator import ORCID_PATTERN, is_valid_orcid
|
from app.utils.orcid_validator import ORCID_PATTERN, is_valid_orcid
|
||||||
|
|
||||||
|
|
||||||
router = APIRouter(prefix="/researchers", tags=["researchers"])
|
router = APIRouter(prefix="/researchers", tags=["researchers"])
|
||||||
|
|
||||||
|
|
||||||
def publication_changed(existing: Publication, data: dict) -> bool:
|
def publication_changed(
|
||||||
|
existing: Publication,
|
||||||
|
data: dict,
|
||||||
|
*,
|
||||||
|
include_detail_fields: bool = True,
|
||||||
|
) -> bool:
|
||||||
fields = [
|
fields = [
|
||||||
"title", "subtitle", "type", "journal",
|
"title", "subtitle", "type", "journal",
|
||||||
"pub_year", "pub_month", "pub_day",
|
"pub_year", "pub_month", "pub_day",
|
||||||
@@ -35,6 +41,8 @@ def publication_changed(existing: Publication, data: dict) -> bool:
|
|||||||
"language_code", "country",
|
"language_code", "country",
|
||||||
"external_ids", "contributors",
|
"external_ids", "contributors",
|
||||||
]
|
]
|
||||||
|
if not include_detail_fields:
|
||||||
|
fields = [f for f in fields if f not in DETAIL_ONLY_FIELDS]
|
||||||
return any(getattr(existing, f) != data[f] for f in fields)
|
return any(getattr(existing, f) != data[f] for f in fields)
|
||||||
|
|
||||||
|
|
||||||
@@ -71,6 +79,8 @@ def _upsert_researcher_publications(
|
|||||||
detail_cap = max(0, min(settings.ORCID_WORK_DETAIL_ENRICH_MAX, 200))
|
detail_cap = max(0, min(settings.ORCID_WORK_DETAIL_ENRICH_MAX, 200))
|
||||||
detail_budget = detail_cap if enrich_work_details else 0
|
detail_budget = detail_cap if enrich_work_details else 0
|
||||||
detail_attempts = 0
|
detail_attempts = 0
|
||||||
|
new_detail_cap = max(0, min(settings.ORCID_NEW_SYNC_DETAIL_MAX, 200))
|
||||||
|
new_detail_attempts = 0
|
||||||
|
|
||||||
for g in groups:
|
for g in groups:
|
||||||
summaries = g.get("work-summary") or []
|
summaries = g.get("work-summary") or []
|
||||||
@@ -82,28 +92,36 @@ def _upsert_researcher_publications(
|
|||||||
if put_code is None:
|
if put_code is None:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
existing = existing_by_put_code.get(put_code)
|
||||||
detail = None
|
detail = None
|
||||||
if detail_budget > 0 and detail_attempts < detail_budget:
|
fetch_detail = (
|
||||||
|
existing is None
|
||||||
|
and new_detail_cap > 0
|
||||||
|
and new_detail_attempts < new_detail_cap
|
||||||
|
)
|
||||||
|
if (
|
||||||
|
not fetch_detail
|
||||||
|
and enrich_work_details
|
||||||
|
and detail_budget > 0
|
||||||
|
and detail_attempts < detail_budget
|
||||||
|
):
|
||||||
|
fetch_detail = True
|
||||||
|
|
||||||
|
if fetch_detail:
|
||||||
try:
|
try:
|
||||||
detail = orcid_client.fetch_work_detail(orcid_id, put_code)
|
detail = orcid_client.fetch_work_detail(orcid_id, put_code)
|
||||||
except Exception:
|
except Exception:
|
||||||
detail = None
|
detail = None
|
||||||
detail_attempts += 1
|
if existing is None:
|
||||||
|
new_detail_attempts += 1
|
||||||
|
else:
|
||||||
|
detail_attempts += 1
|
||||||
|
|
||||||
data = PublicationNormalizer.normalize(summary, detail)
|
data = PublicationNormalizer.normalize(summary, detail)
|
||||||
|
preserve_detail = detail is None
|
||||||
existing = existing_by_put_code.get(data["put_code"])
|
|
||||||
|
|
||||||
if existing:
|
if existing:
|
||||||
for field in [
|
apply_publication_data(existing, data, preserve_detail_if_missing=preserve_detail)
|
||||||
"title", "subtitle", "type", "journal",
|
|
||||||
"pub_year", "pub_month", "pub_day",
|
|
||||||
"doi", "url", "short_description",
|
|
||||||
"citation_type", "citation_value",
|
|
||||||
"language_code", "country",
|
|
||||||
"external_ids", "contributors",
|
|
||||||
]:
|
|
||||||
setattr(existing, field, data[field])
|
|
||||||
existing.last_modified = datetime.utcnow()
|
existing.last_modified = datetime.utcnow()
|
||||||
existing.status = None
|
existing.status = None
|
||||||
publications.append(existing)
|
publications.append(existing)
|
||||||
@@ -313,6 +331,8 @@ def sync_researcher(
|
|||||||
detail_cap = max(0, min(settings.ORCID_WORK_DETAIL_ENRICH_MAX, 200))
|
detail_cap = max(0, min(settings.ORCID_WORK_DETAIL_ENRICH_MAX, 200))
|
||||||
detail_budget = detail_cap if enrich_work_details else 0
|
detail_budget = detail_cap if enrich_work_details else 0
|
||||||
detail_attempts = 0
|
detail_attempts = 0
|
||||||
|
new_detail_cap = max(0, min(settings.ORCID_NEW_SYNC_DETAIL_MAX, 200))
|
||||||
|
new_detail_attempts = 0
|
||||||
|
|
||||||
for g in groups:
|
for g in groups:
|
||||||
summaries = g.get("work-summary") or []
|
summaries = g.get("work-summary") or []
|
||||||
@@ -324,26 +344,50 @@ def sync_researcher(
|
|||||||
if put_code is None:
|
if put_code is None:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
existing = existing_by_put_code.get(put_code)
|
||||||
detail = None
|
detail = None
|
||||||
if detail_budget > 0 and detail_attempts < detail_budget:
|
fetch_detail = (
|
||||||
|
existing is None
|
||||||
|
and new_detail_cap > 0
|
||||||
|
and new_detail_attempts < new_detail_cap
|
||||||
|
)
|
||||||
|
if (
|
||||||
|
not fetch_detail
|
||||||
|
and enrich_work_details
|
||||||
|
and detail_budget > 0
|
||||||
|
and detail_attempts < detail_budget
|
||||||
|
):
|
||||||
|
fetch_detail = True
|
||||||
|
|
||||||
|
if fetch_detail:
|
||||||
try:
|
try:
|
||||||
detail = orcid_client.fetch_work_detail(orcid_id, put_code)
|
detail = orcid_client.fetch_work_detail(orcid_id, put_code)
|
||||||
except Exception:
|
except Exception:
|
||||||
detail = None
|
detail = None
|
||||||
detail_attempts += 1
|
if existing is None:
|
||||||
|
new_detail_attempts += 1
|
||||||
|
else:
|
||||||
|
detail_attempts += 1
|
||||||
|
|
||||||
data = PublicationNormalizer.normalize(summary, detail)
|
data = PublicationNormalizer.normalize(summary, detail)
|
||||||
|
preserve_detail = detail is None
|
||||||
existing = existing_by_put_code.get(data["put_code"])
|
|
||||||
|
|
||||||
if existing:
|
if existing:
|
||||||
if publication_changed(existing, data):
|
if publication_changed(
|
||||||
for field in data:
|
existing,
|
||||||
setattr(existing, field, data[field])
|
data,
|
||||||
|
include_detail_fields=not preserve_detail,
|
||||||
|
):
|
||||||
|
apply_publication_data(
|
||||||
|
existing, data, preserve_detail_if_missing=preserve_detail
|
||||||
|
)
|
||||||
existing.last_modified = datetime.utcnow()
|
existing.last_modified = datetime.utcnow()
|
||||||
existing.status = "updated"
|
existing.status = "updated"
|
||||||
updated_count += 1
|
updated_count += 1
|
||||||
else:
|
else:
|
||||||
|
apply_publication_data(
|
||||||
|
existing, data, preserve_detail_if_missing=preserve_detail
|
||||||
|
)
|
||||||
existing.status = "unchanged"
|
existing.status = "unchanged"
|
||||||
unchanged_count += 1
|
unchanged_count += 1
|
||||||
|
|
||||||
|
|||||||
@@ -104,6 +104,8 @@ class Settings(BaseSettings):
|
|||||||
# Por defecto solo se usa el resumen de GET /works. Si se pide enrich, como máximo
|
# Por defecto solo se usa el resumen de GET /works. Si se pide enrich, como máximo
|
||||||
# se harán tantas peticiones de detalle (el resto se normaliza solo con summary).
|
# se harán tantas peticiones de detalle (el resto se normaliza solo con summary).
|
||||||
ORCID_WORK_DETAIL_ENRICH_MAX: int = 50
|
ORCID_WORK_DETAIL_ENRICH_MAX: int = 50
|
||||||
|
# Máximo de obras nuevas (por sync) a las que se pide /work/{put_code}; el resto en export.
|
||||||
|
ORCID_NEW_SYNC_DETAIL_MAX: int = 100
|
||||||
|
|
||||||
DOCS_ENABLED: bool = True
|
DOCS_ENABLED: bool = True
|
||||||
|
|
||||||
|
|||||||
@@ -98,7 +98,11 @@ class ORCIDClient:
|
|||||||
response = self._http.get(url, headers=self._headers())
|
response = self._http.get(url, headers=self._headers())
|
||||||
if response.status_code != 200:
|
if response.status_code != 200:
|
||||||
return None
|
return None
|
||||||
return response.json()
|
payload = response.json()
|
||||||
|
# ORCID v3 devuelve el work anidado bajo la clave "work".
|
||||||
|
if isinstance(payload, dict) and "work" in payload:
|
||||||
|
return payload["work"]
|
||||||
|
return payload
|
||||||
|
|
||||||
# ---------------------------------------------------------
|
# ---------------------------------------------------------
|
||||||
# OAuth 3-legged (authorization code)
|
# OAuth 3-legged (authorization code)
|
||||||
|
|||||||
@@ -0,0 +1,152 @@
|
|||||||
|
"""
Enrichment of publications from ORCID (/work/{put_code}).

- Fast sync: uses only the /works summary; does not overwrite detail fields already stored.
- New publications during sync: detail is requested up to a configured per-sync cap.
- Export: detail is completed only for the works that are about to be downloaded.
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
|
from typing import Iterable, List
|
||||||
|
|
||||||
|
from sqlalchemy.orm import Session
|
||||||
|
|
||||||
|
from app.db.models import Publication, Researcher
|
||||||
|
from app.services.normalizer import PublicationNormalizer
|
||||||
|
from app.services.orcid_client import get_orcid_client
|
||||||
|
|
||||||
|
logger = logging.getLogger("app.services.publication_enrichment")
|
||||||
|
|
||||||
|
# Fields that usually only come from GET /work/{put_code}.
DETAIL_ONLY_FIELDS = (
    "subtitle",
    "citation_type",
    "citation_value",
    "language_code",
    "country",
    "external_ids",
    "contributors",
)

# Fields that apply_publication_data copies from normalized data onto a Publication.
UPDATABLE_FIELDS = (
    "title",
    "subtitle",
    "type",
    "journal",
    "pub_year",
    "pub_month",
    "pub_day",
    "doi",
    "url",
    "short_description",
    "citation_type",
    "citation_value",
    "language_code",
    "country",
    "external_ids",
    "contributors",
)
|
||||||
|
|
||||||
|
|
||||||
|
def publication_lacks_detail(pub: Publication) -> bool:
    """Report whether *pub* has no stored ORCID work detail.

    A publication counts as lacking detail only when ``contributors``,
    ``citation_value``, ``external_ids`` and ``subtitle`` are all ``None``.
    """
    probe_fields = ("contributors", "citation_value", "external_ids", "subtitle")
    # all() stops at the first populated field, like a chained ``and``.
    return all(getattr(pub, name) is None for name in probe_fields)
|
||||||
|
|
||||||
|
|
||||||
|
def apply_publication_data(
    pub: Publication,
    data: dict,
    *,
    preserve_detail_if_missing: bool,
) -> None:
    """Copy every field in ``UPDATABLE_FIELDS`` from *data* onto *pub*.

    Args:
        pub: Object whose attributes are overwritten in place.
        data: Normalized values keyed by field name; absent keys read as ``None``.
        preserve_detail_if_missing: When true, a field listed in
            ``DETAIL_ONLY_FIELDS`` whose incoming value is ``None`` is left
            untouched on *pub* instead of being reset.
    """
    for name in UPDATABLE_FIELDS:
        incoming = data.get(name)
        # NOTE(review): only ``None`` counts as "missing" here; if the normalizer
        # emits empty lists/strings when no detail was fetched, those would still
        # overwrite stored values -- confirm against PublicationNormalizer.
        keep_stored = (
            preserve_detail_if_missing
            and name in DETAIL_ONLY_FIELDS
            and incoming is None
        )
        if not keep_stored:
            setattr(pub, name, incoming)
|
||||||
|
|
||||||
|
|
||||||
|
def _summary_by_put_code(works_payload: dict) -> dict[int, dict]:
|
||||||
|
out: dict[int, dict] = {}
|
||||||
|
for group in works_payload.get("group", []) or []:
|
||||||
|
summaries = group.get("work-summary") or []
|
||||||
|
if not summaries:
|
||||||
|
continue
|
||||||
|
summary = summaries[0]
|
||||||
|
put_code = summary.get("put-code")
|
||||||
|
if put_code is not None:
|
||||||
|
out[int(put_code)] = summary
|
||||||
|
return out
|
||||||
|
|
||||||
|
|
||||||
|
def _fetch_detail_safe(orcid_id: str, put_code: int) -> dict | None:
    """Fetch one ORCID work detail, logging and swallowing any failure.

    Returns whatever ``fetch_work_detail`` returns, or ``None`` when the call
    raises.
    """
    result = None
    try:
        result = get_orcid_client().fetch_work_detail(orcid_id, put_code)
    except Exception:
        logger.exception("ORCID work detail failed orcid=%s put_code=%s", orcid_id, put_code)
    return result
|
||||||
|
|
||||||
|
|
||||||
|
def enrich_publications_from_orcid(
    db: Session,
    researcher: Researcher,
    publications: Iterable[Publication],
    *,
    max_workers: int = 6,
) -> None:
    """Fill in ORCID work detail, in the database and in memory, where it is missing.

    Only the objects passed in *publications* are touched (e.g. the ones being
    exported), and only those for which ``publication_lacks_detail`` is true.
    The whole operation is best-effort: a failure to fetch the works list or an
    individual work detail is logged and the affected publications are left
    unchanged.

    Args:
        db: Session used to commit the changes and refresh the publications.
        researcher: Owner of the publications; its ``orcid_id`` is queried.
        publications: Candidate publications.
        max_workers: Upper bound on concurrent detail requests.
    """
    to_enrich = [p for p in publications if publication_lacks_detail(p)]
    if not to_enrich:
        return

    orcid_id = researcher.orcid_id
    client = get_orcid_client()
    try:
        works = client.fetch_works(orcid_id)
    except Exception:
        logger.exception("Could not fetch ORCID works for enrichment orcid=%s", orcid_id)
        return
    if not isinstance(works, dict):
        # No usable payload came back; nothing to enrich from.
        return

    summaries = _summary_by_put_code(works)
    # dict.fromkeys de-duplicates while keeping order, so a put-code that
    # appears on several publications is requested from ORCID only once.
    put_codes = list(
        dict.fromkeys(p.put_code for p in to_enrich if p.put_code in summaries)
    )
    if not put_codes:
        return

    details: dict[int, dict | None] = {}
    workers = max(1, min(max_workers, len(put_codes)))
    with ThreadPoolExecutor(max_workers=workers) as pool:
        futures = {
            pool.submit(_fetch_detail_safe, orcid_id, pc): pc for pc in put_codes
        }
        for fut in as_completed(futures):
            details[futures[fut]] = fut.result()

    updated = False
    for pub in to_enrich:
        summary = summaries.get(pub.put_code)
        detail = details.get(pub.put_code)
        if not summary or not detail:
            continue
        data = PublicationNormalizer.normalize(summary, detail)
        apply_publication_data(pub, data, preserve_detail_if_missing=False)
        updated = True

    if not updated:
        # Nothing was written: skip the commit and the per-row refresh queries.
        return

    db.commit()
    for pub in to_enrich:
        db.refresh(pub)
|
||||||
Reference in New Issue
Block a user