Files
ORCID2SWORD/backend/app/api/researchers.py
T
Mireya Cueto Garrido 9b596af494 feat(backend): detalle ORCID en export y sync sin borrar datos
Enriquece obras al exportar SWORD/ZIP, pide detalle en obras nuevas (con tope), preserva campos de detalle en re-sync y desenvuelve la respuesta work de ORCID.
2026-05-20 12:56:02 +02:00

423 lines
13 KiB
Python

import logging
from datetime import datetime
from typing import List

import httpx
from fastapi import APIRouter, Depends, HTTPException, Path, Query, Request
from sqlalchemy.orm import Session

from app.core.config import settings
from app.core.rate_limit import limiter
from app.db.models import Publication, PublicationDownload, Researcher
from app.db.session import get_db
from app.schema.publication import PublicationSchema
from app.schema.researcher import (
    ResearcherBatchSearchRequestSchema,
    ResearcherBatchSearchResponseSchema,
    ResearcherSearchErrorSchema,
    ResearcherStatsSchema,
    ResearcherWithPublicationsSchema,
)
from app.security.jwt import get_optional_current_researcher
from app.services.normalizer import PublicationNormalizer
from app.services.orcid_client import get_display_name, get_orcid_client
from app.services.publication_enrichment import DETAIL_ONLY_FIELDS, apply_publication_data
from app.utils.orcid_validator import ORCID_PATTERN, is_valid_orcid
router = APIRouter(prefix="/researchers", tags=["researchers"])
def publication_changed(
    existing: Publication,
    data: dict,
    *,
    include_detail_fields: bool = True,
) -> bool:
    """Report whether ``data`` differs from the stored publication.

    Args:
        existing: Stored publication whose attributes are compared.
        data: Mapping holding a value for every compared field name; a
            missing key raises ``KeyError``.
        include_detail_fields: When false, field names listed in
            ``DETAIL_ONLY_FIELDS`` are left out of the comparison.

    Returns:
        True as soon as one compared field differs, otherwise False.
    """
    compared = (
        "title", "subtitle", "type", "journal",
        "pub_year", "pub_month", "pub_day",
        "doi", "url", "short_description",
        "citation_type", "citation_value",
        "language_code", "country",
        "external_ids", "contributors",
    )
    for name in compared:
        # Detail-only fields are skipped when the caller has no detail payload.
        if not include_detail_fields and name in DETAIL_ONLY_FIELDS:
            continue
        if getattr(existing, name) != data[name]:
            return True
    return False
def build_researcher_stats(publications: list) -> ResearcherStatsSchema:
    """Summarise a list of publications as a total plus a per-type tally.

    Items without a truthy ``type`` attribute are counted under ``"unknown"``.
    """
    tally: dict[str, int] = {}
    for item in publications:
        kind = getattr(item, "type", None) or "unknown"
        if kind in tally:
            tally[kind] += 1
        else:
            tally[kind] = 1
    return ResearcherStatsSchema(
        total_publications=len(publications),
        publication_types=tally,
    )
def _upsert_researcher_publications(
    researcher: Researcher,
    orcid_id: str,
    db: Session,
    *,
    enrich_work_details: bool = False,
) -> List[Publication]:
    """Fetch the ORCID works of ``orcid_id`` and upsert them for ``researcher``.

    For every work group the first ``work-summary`` is used. A per-work detail
    request is made only while one of two budgets has room:

    * the "new work" budget (``ORCID_NEW_SYNC_DETAIL_MAX``, capped at 200),
      available to works not yet stored for this researcher;
    * the "enrichment" budget (``ORCID_WORK_DETAIL_ENRICH_MAX``, capped at
      200), available to any work but only when ``enrich_work_details`` is
      true.

    Each attempt is charged to the budget that authorised it, so the total
    number of detail requests per call never exceeds the sum of both caps.

    Args:
        researcher: Owner of the publications; ``last_sync_at`` is updated.
        orcid_id: ORCID iD passed to the ORCID client.
        db: Session used to load, add and commit publications.
        enrich_work_details: Enables the enrichment budget.

    Returns:
        The created or updated publications, in the order ORCID returned
        them, each with ``status`` reset to ``None``.
    """
    orcid_client = get_orcid_client()
    works = orcid_client.fetch_works(orcid_id)
    # ``or []`` also covers an explicit ``"group": null`` in the payload.
    groups = works.get("group") or []
    publications: List[Publication] = []
    existing_by_put_code = {
        publication.put_code: publication
        for publication in db.query(Publication)
        .filter(Publication.researcher_id == researcher.id)
        .all()
    }

    # Hard ceiling to avoid timeouts and abuse even if the .env asks for a huge value.
    detail_cap = max(0, min(settings.ORCID_WORK_DETAIL_ENRICH_MAX, 200))
    detail_budget = detail_cap if enrich_work_details else 0
    detail_attempts = 0
    new_detail_cap = max(0, min(settings.ORCID_NEW_SYNC_DETAIL_MAX, 200))
    new_detail_attempts = 0

    for g in groups:
        summaries = g.get("work-summary") or []
        if not summaries:
            continue
        summary = summaries[0]
        put_code = summary.get("put-code")
        if put_code is None:
            continue

        existing = existing_by_put_code.get(put_code)
        detail = None

        # Decide which budget (if any) pays for a detail request.
        use_new_budget = existing is None and new_detail_attempts < new_detail_cap
        use_enrich_budget = not use_new_budget and detail_attempts < detail_budget
        if use_new_budget or use_enrich_budget:
            # Charge the budget that authorised the fetch. Keying the counter
            # on "is this work new?" would leave the enrichment budget
            # untouched by new works and make the hard cap ineffective.
            if use_new_budget:
                new_detail_attempts += 1
            else:
                detail_attempts += 1
            try:
                detail = orcid_client.fetch_work_detail(orcid_id, put_code)
            except Exception:
                # Best effort: fall back to the summary, but leave a trace.
                logging.getLogger(__name__).warning(
                    "ORCID work detail fetch failed for %s put-code %s; using summary only",
                    orcid_id,
                    put_code,
                    exc_info=True,
                )
                detail = None

        data = PublicationNormalizer.normalize(summary, detail)
        # Without a detail payload, previously stored detail fields are kept.
        preserve_detail = detail is None

        if existing:
            apply_publication_data(existing, data, preserve_detail_if_missing=preserve_detail)
            existing.last_modified = datetime.utcnow()
            existing.status = None
            publications.append(existing)
        else:
            pub = Publication(
                researcher_id=researcher.id,
                **data,
                last_modified=datetime.utcnow(),
            )
            pub.status = None
            db.add(pub)
            publications.append(pub)
            # Register the new row so a repeated put-code updates it instead
            # of inserting a duplicate.
            existing_by_put_code[data["put_code"]] = pub

    researcher.last_sync_at = datetime.utcnow()
    db.commit()
    db.refresh(researcher)
    return publications
def _decorate_downloaded_by_me(
    *,
    db: Session,
    current: Researcher | None,
    publications: List[Publication],
) -> List[PublicationSchema] | List[Publication]:
    """Flag which publications the current researcher has already downloaded.

    Without a current researcher the input list is returned untouched.
    Otherwise every publication is validated into a ``PublicationSchema``
    copy whose ``downloaded_by_me`` says whether a ``PublicationDownload``
    row exists for that researcher and publication.
    """
    if not current:
        return publications

    download_rows = (
        db.query(PublicationDownload.publication_id)
        .filter(PublicationDownload.researcher_id == current.id)
        .all()
    )
    seen_ids = {row[0] for row in download_rows}

    return [
        PublicationSchema.model_validate(item).model_copy(
            update={"downloaded_by_me": item.id in seen_ids}
        )
        for item in publications
    ]
def build_search_response(
    orcid_id: str,
    db: Session,
    current: Researcher | None,
    *,
    enrich_work_details: bool = False,
) -> ResearcherWithPublicationsSchema:
    """Find or create the researcher for ``orcid_id`` and sync their works.

    Args:
        orcid_id: ORCID iD to look up; checked with ``is_valid_orcid``.
        db: Session used for the lookup, the insert and the sync.
        current: Optional caller, used only for the ``downloaded_by_me`` flag.
        enrich_work_details: Forwarded to the publication upsert.

    Returns:
        The researcher with publications and stats. The new/updated/unchanged
        counters are always reported as 0 on this path.

    Raises:
        HTTPException: 400 when ``orcid_id`` is not a valid ORCID iD.
    """
    if not is_valid_orcid(orcid_id):
        raise HTTPException(status_code=400, detail="Invalid ORCID iD")

    record = db.query(Researcher).filter(Researcher.orcid_id == orcid_id).first()
    if not record:
        record = Researcher(
            orcid_id=orcid_id,
            name=None,
            authenticated=False,
            last_sync_at=None,
        )
        db.add(record)
        db.flush()

    # Fill in the display name when none is stored yet.
    if not record.name:
        fetched_name = get_display_name(orcid_id)
        if fetched_name:
            record.name = fetched_name
            db.flush()

    synced = _upsert_researcher_publications(
        record,
        orcid_id,
        db,
        enrich_work_details=enrich_work_details,
    )
    decorated = _decorate_downloaded_by_me(db=db, current=current, publications=synced)
    summary_stats = build_researcher_stats(decorated)

    return ResearcherWithPublicationsSchema(
        researcher=record,
        publications=decorated,
        stats=summary_stats,
        new_records=0,
        updated_records=0,
        unchanged_records=0,
        total_records=len(decorated),
    )
# ---------------------------------------------------------
# ENDPOINT 1: SEARCH + SYNC
# ---------------------------------------------------------
@router.post(
    "/search",
    response_model=ResearcherBatchSearchResponseSchema,
    response_model_exclude_none=True,
)
@limiter.limit(settings.RATE_LIMIT_SEARCH_ANON)
def search_and_sync_researchers(
    request: Request,
    payload: ResearcherBatchSearchRequestSchema,
    db: Session = Depends(get_db),
    current: Researcher | None = Depends(get_optional_current_researcher),
):
    # Batch search: each distinct ORCID iD in the payload is processed on its
    # own; a failure for one iD is recorded in ``errors`` and does not stop
    # the others. (Kept as comments, not a docstring, so the generated API
    # description is unchanged.)
    results: List[ResearcherWithPublicationsSchema] = []
    errors: List[ResearcherSearchErrorSchema] = []

    # dict.fromkeys drops duplicates while keeping first-seen order.
    distinct_ids = list(dict.fromkeys(payload.orcid_ids))

    for orcid_id in distinct_ids:
        try:
            outcome = build_search_response(
                orcid_id,
                db,
                current,
                enrich_work_details=payload.enrich_work_details,
            )
        except HTTPException as exc:
            failure = str(exc.detail)
        except httpx.HTTPStatusError as exc:
            failure = f"ORCID returned {exc.response.status_code}"
        except Exception:
            failure = "Unexpected error while processing ORCID iD"
        else:
            results.append(outcome)
            continue

        # Any failure: discard partial work for this iD and report it.
        db.rollback()
        errors.append(
            ResearcherSearchErrorSchema(
                orcid_id=orcid_id,
                detail=failure,
            )
        )

    return ResearcherBatchSearchResponseSchema(
        results=results,
        errors=errors,
        total_requested=len(distinct_ids),
        total_processed=len(results),
    )
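# ---------------------------------------------------------
# ENDPOINT 2: FULL SYNC
# NOTE(review): labelled as requiring authentication, but the handler only
# resolves an *optional* current researcher - confirm where auth is enforced.
# ---------------------------------------------------------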
@router.post(
    "/{orcid_id}/sync",
    response_model=ResearcherWithPublicationsSchema,
    response_model_exclude_none=True,
)
@limiter.limit(settings.RATE_LIMIT_SYNC)
def sync_researcher(
    request: Request,
    orcid_id: str = Path(min_length=19, max_length=19, pattern=ORCID_PATTERN),
    enrich_work_details: bool = Query(
        False,
        description=(
            "Si es true, consulta ORCID GET /work/{put_code} hasta min(ORCID_WORK_DETAIL_ENRICH_MAX, 200) "
            "veces por perfil; el resto usa solo el resumen de /works. Por defecto false (recomendado para "
            "perfiles con muchas publicaciones)."
        ),
    ),
    db: Session = Depends(get_db),
    current: Researcher | None = Depends(get_optional_current_researcher),
):
    # Re-sync an already stored researcher against ORCID and classify every
    # work as "new", "updated" or "unchanged".
    #
    # Parameters:
    #   orcid_id            - path parameter, validated by the Path pattern and
    #                         again by is_valid_orcid (400 on failure).
    #   enrich_work_details - enables the enrichment budget for detail requests.
    #   current             - optional caller; only used for ``downloaded_by_me``.
    #                         NOTE(review): because it is optional, this handler
    #                         itself does not reject anonymous callers.
    # Raises HTTPException 404 when no researcher is stored for ``orcid_id``.
    # (Comments instead of a docstring so the generated API description is
    # unchanged.)
    if not is_valid_orcid(orcid_id):
        raise HTTPException(status_code=400, detail="Invalid ORCID iD")

    researcher = db.query(Researcher).filter_by(orcid_id=orcid_id).first()
    if not researcher:
        raise HTTPException(status_code=404, detail="Researcher not found")

    orcid_client = get_orcid_client()
    works = orcid_client.fetch_works(orcid_id)
    # ``or []`` also covers an explicit ``"group": null`` in the payload.
    groups = works.get("group") or []

    publications_output = []
    new_count = 0
    updated_count = 0
    unchanged_count = 0

    existing_by_put_code = {
        publication.put_code: publication
        for publication in db.query(Publication)
        .filter(Publication.researcher_id == researcher.id)
        .all()
    }

    # Hard ceiling to avoid timeouts and abuse even if the .env asks for a huge value.
    detail_cap = max(0, min(settings.ORCID_WORK_DETAIL_ENRICH_MAX, 200))
    detail_budget = detail_cap if enrich_work_details else 0
    detail_attempts = 0
    new_detail_cap = max(0, min(settings.ORCID_NEW_SYNC_DETAIL_MAX, 200))
    new_detail_attempts = 0

    for g in groups:
        summaries = g.get("work-summary") or []
        if not summaries:
            continue
        summary = summaries[0]
        put_code = summary.get("put-code")
        if put_code is None:
            continue

        existing = existing_by_put_code.get(put_code)
        detail = None

        # Decide which budget (if any) pays for a detail request.
        use_new_budget = existing is None and new_detail_attempts < new_detail_cap
        use_enrich_budget = not use_new_budget and detail_attempts < detail_budget
        if use_new_budget or use_enrich_budget:
            # Charge the budget that authorised the fetch. Keying the counter
            # on "is this work new?" would leave the enrichment budget
            # untouched by new works, so the documented per-profile limit
            # would not hold.
            if use_new_budget:
                new_detail_attempts += 1
            else:
                detail_attempts += 1
            try:
                detail = orcid_client.fetch_work_detail(orcid_id, put_code)
            except Exception:
                # Best effort: fall back to the summary, but leave a trace.
                logging.getLogger(__name__).warning(
                    "ORCID work detail fetch failed for %s put-code %s; using summary only",
                    orcid_id,
                    put_code,
                    exc_info=True,
                )
                detail = None

        data = PublicationNormalizer.normalize(summary, detail)
        # Without a detail payload, previously stored detail fields are kept
        # and excluded from the change comparison.
        preserve_detail = detail is None

        if existing:
            # The comparison must run BEFORE the data is applied, otherwise
            # every record would look unchanged.
            changed = publication_changed(
                existing,
                data,
                include_detail_fields=not preserve_detail,
            )
            apply_publication_data(
                existing, data, preserve_detail_if_missing=preserve_detail
            )
            if changed:
                existing.last_modified = datetime.utcnow()
                existing.status = "updated"
                updated_count += 1
            else:
                existing.status = "unchanged"
                unchanged_count += 1
            pub = existing
        else:
            pub = Publication(
                researcher_id=researcher.id,
                **data,
                last_modified=datetime.utcnow(),
            )
            pub.status = "new"
            db.add(pub)
            new_count += 1
            # Register the new row so a repeated put-code updates it instead
            # of inserting a duplicate.
            existing_by_put_code[data["put_code"]] = pub

        publications_output.append(pub)

    researcher.last_sync_at = datetime.utcnow()
    db.commit()
    db.refresh(researcher)

    publications_out = _decorate_downloaded_by_me(
        db=db, current=current, publications=publications_output
    )
    return ResearcherWithPublicationsSchema(
        researcher=researcher,
        publications=publications_out,
        stats=build_researcher_stats(publications_out),
        new_records=new_count,
        updated_records=updated_count,
        unchanged_records=unchanged_count,
        total_records=new_count + updated_count + unchanged_count,
    )