Versión 3 Backend - Endpoints finales corregidos

This commit is contained in:
Mireya Cueto Garrido
2026-04-27 13:39:32 +02:00
parent a286c2e3ae
commit 96f01c0126
4343 changed files with 1046097 additions and 465 deletions
+101
View File
@@ -0,0 +1,101 @@
from fastapi import APIRouter, Depends, HTTPException
from fastapi.responses import Response
from sqlalchemy.orm import Session
from uuid import UUID
from app.db.session import get_db
from app.db.models import Publication, Researcher
from app.security.api_key import get_api_key
from app.services.sword_generator import SWORDGenerator
from app.services.zip_generator import ZIPGenerator
router = APIRouter(prefix="/export")
def validate_uuid_list(pub_ids: list[str]) -> list[UUID]:
    """Parse every publication ID into a ``UUID``, failing on the first bad one.

    Args:
        pub_ids: Raw publication identifiers as received from the client.

    Returns:
        The parsed UUIDs, in the same order as ``pub_ids``.

    Raises:
        HTTPException: 400, naming the first value that is not a valid UUID.
    """
    valid_ids: list[UUID] = []
    for pid in pub_ids:
        try:
            valid_ids.append(UUID(pid))
        except (ValueError, TypeError, AttributeError) as err:
            # Only the failures UUID() raises for malformed input are turned
            # into a 400; any other exception is a real bug and must surface.
            raise HTTPException(
                status_code=400,
                detail=f"Invalid publication ID (not UUID): {pid}",
            ) from err
    return valid_ids
@router.post("/sword/publications")
async def export_multiple_sword(
    pub_ids: list[str],
    db: Session = Depends(get_db),
    api_key: str = Depends(get_api_key),
):
    """Export the requested publications as one SWORD XML feed.

    Args:
        pub_ids: Publication IDs (UUID strings) to include.
        db: Database session supplied by the ``get_db`` dependency.
        api_key: Value returned by the ``get_api_key`` dependency; unused in
            the body, declared so the dependency runs for this route.

    Returns:
        An ``application/xml`` response with the generated feed.

    Raises:
        HTTPException: 400 if any ID is not a UUID, 404 if none of the IDs
            match a stored publication.
    """
    # Use the parsed UUIDs in the query: the validation result was previously
    # discarded and raw strings were bound against the UUID primary key.
    publication_ids = validate_uuid_list(pub_ids)
    pubs = db.query(Publication).filter(Publication.id.in_(publication_ids)).all()
    if not pubs:
        raise HTTPException(status_code=404, detail="No publications found")

    # The feed is attributed to the researcher of the first publication returned.
    researcher = db.query(Researcher).filter_by(id=pubs[0].researcher_id).first()
    xml_bytes = SWORDGenerator.generate_feed_xml(researcher, pubs)
    return Response(content=xml_bytes, media_type="application/xml")
@router.get("/sword/researcher/{orcid_id}")
async def export_researcher_sword(
    orcid_id: str,
    db: Session = Depends(get_db),
    api_key: str = Depends(get_api_key),
):
    """Export every stored publication of one researcher as a SWORD XML feed.

    Raises:
        HTTPException: 404 when the ORCID iD is unknown or the researcher has
            no publications.
    """
    owner = db.query(Researcher).filter_by(orcid_id=orcid_id).first()
    if not owner:
        raise HTTPException(status_code=404, detail="Researcher not found")

    owned = db.query(Publication).filter_by(researcher_id=owner.id).all()
    if not owned:
        raise HTTPException(
            status_code=404,
            detail="No publications found for this researcher",
        )

    return Response(
        content=SWORDGenerator.generate_feed_xml(owner, owned),
        media_type="application/xml",
    )
@router.post("/zip/publications")
async def export_multiple_zip(
    pub_ids: list[str],
    db: Session = Depends(get_db),
    api_key: str = Depends(get_api_key),
):
    """Export the requested publications as one ZIP package.

    Args:
        pub_ids: Publication IDs (UUID strings) to include.
        db: Database session supplied by the ``get_db`` dependency.
        api_key: Value returned by the ``get_api_key`` dependency; unused in
            the body, declared so the dependency runs for this route.

    Returns:
        An ``application/zip`` response with the generated archive.

    Raises:
        HTTPException: 400 if any ID is not a UUID, 404 if none of the IDs
            match a stored publication.
    """
    # Use the parsed UUIDs in the query: the validation result was previously
    # discarded and raw strings were bound against the UUID primary key.
    publication_ids = validate_uuid_list(pub_ids)
    pubs = db.query(Publication).filter(Publication.id.in_(publication_ids)).all()
    if not pubs:
        raise HTTPException(status_code=404, detail="No publications found")

    # The package is attributed to the researcher of the first publication returned.
    researcher = db.query(Researcher).filter_by(id=pubs[0].researcher_id).first()
    zip_bytes = ZIPGenerator.generate_zip(researcher, pubs)
    return Response(content=zip_bytes, media_type="application/zip")
@router.get("/zip/researcher/{orcid_id}")
async def export_researcher_zip(
    orcid_id: str,
    db: Session = Depends(get_db),
    api_key: str = Depends(get_api_key),
):
    """Export every stored publication of one researcher as a ZIP package.

    Raises:
        HTTPException: 404 when the ORCID iD is unknown or the researcher has
            no publications.
    """
    owner = db.query(Researcher).filter_by(orcid_id=orcid_id).first()
    if not owner:
        raise HTTPException(status_code=404, detail="Researcher not found")

    owned = db.query(Publication).filter_by(researcher_id=owner.id).all()
    if not owned:
        raise HTTPException(
            status_code=404,
            detail="No publications found for this researcher",
        )

    return Response(
        content=ZIPGenerator.generate_zip(owner, owned),
        media_type="application/zip",
    )
+189 -101
View File
@@ -1,120 +1,208 @@
from datetime import datetime
from typing import List
from fastapi import APIRouter, Depends, HTTPException
from fastapi.responses import Response
from sqlalchemy.orm import Session
from app.schema.publication import PublicationSchema
from app.db.models import Publication, Researcher
from app.db.session import get_db
from app.repositories.researcher_repository import ResearcherRepository
from app.repositories.publication_repository import PublicationRepository
from app.services.sync_service import SyncService
from app.services.sword_exporter import SWORDExporter
from app.utils.orcid_validator import is_valid_orcid
from app.schema.researcher import ResearcherWithPublicationsSchema
from app.services.normalizer import PublicationNormalizer
from app.services.orcid_client import get_works_summary, get_work_detail
router = APIRouter(prefix="/researchers", tags=["researchers"])
def validate_orcid_or_400(orcid_id: str):
if not is_valid_orcid(orcid_id):
raise HTTPException(
status_code=400,
detail=f"ORCID ID '{orcid_id}' no es válido según el formato y dígito de control."
# ---------------------------------------------------------
# Helper: detect whether a stored publication has changed
# ---------------------------------------------------------
def publication_changed(existing: Publication, data: dict) -> bool:
    """Report whether any tracked field of ``existing`` differs from ``data``.

    Args:
        existing: The stored publication to compare.
        data: Freshly normalised values; must contain every tracked field.

    Returns:
        ``True`` as soon as one tracked field differs, ``False`` otherwise.

    Raises:
        KeyError: If ``data`` lacks a tracked field reached by the comparison.
    """
    tracked = (
        "title", "subtitle", "type", "journal",
        "pub_year", "pub_month", "pub_day",
        "doi", "url", "short_description",
        "citation_type", "citation_value",
        "language_code", "country",
        "external_ids", "contributors",
    )
    # any() stops at the first mismatch, like the explicit early return did.
    return any(getattr(existing, name) != data[name] for name in tracked)
# ---------------------------------------------------------
# ENDPOINT 1: SEARCH + SYNC (sin contadores)
# ---------------------------------------------------------
@router.get("/search/{orcid_id}", response_model=ResearcherWithPublicationsSchema)
def search_and_sync_researcher(orcid_id: str, db: Session = Depends(get_db)):
# Buscar o crear Researcher
researcher = db.query(Researcher).filter(Researcher.orcid_id == orcid_id).first()
if not researcher:
researcher = Researcher(
orcid_id=orcid_id,
name=None,
authenticated=False,
last_sync_at=None,
)
db.add(researcher)
db.flush()
# Obtener works summary desde ORCID
works = get_works_summary(orcid_id)
groups = works.get("group", [])
publications: List[Publication] = []
for g in groups:
summaries = g.get("work-summary") or []
if not summaries:
continue
summary = summaries[0]
put_code = summary.get("put-code")
if put_code is None:
continue
# Obtener detalle del work
try:
detail = get_work_detail(orcid_id, put_code)
except Exception:
detail = None
# Normalizar datos
data = PublicationNormalizer.normalize(summary, detail)
# Ver si ya existe la publicación
existing = (
db.query(Publication)
.filter(
Publication.researcher_id == researcher.id,
Publication.put_code == data["put_code"],
)
.first()
)
if existing:
for field in [
"title", "subtitle", "type", "journal",
"pub_year", "pub_month", "pub_day",
"doi", "url", "short_description",
"citation_type", "citation_value",
"language_code", "country",
"external_ids", "contributors"
]:
setattr(existing, field, data[field])
existing.last_modified = datetime.utcnow()
existing.status = None
publications.append(existing)
else:
pub = Publication(
researcher_id=researcher.id,
**data,
last_modified=datetime.utcnow(),
)
pub.status = None
db.add(pub)
publications.append(pub)
@router.post("/", response_model=dict)
def create_researcher(orcid_id: str, db: Session = Depends(get_db)):
validate_orcid_or_400(orcid_id)
researcher.last_sync_at = datetime.utcnow()
db.commit()
db.refresh(researcher)
existing = ResearcherRepository.get_by_orcid(db, orcid_id)
if existing:
return {
"status": "ok",
"message": "Researcher ya existe.",
"orcid_id": existing.orcid_id,
"id": existing.id
}
# Aquí podrías opcionalmente validar que el ORCID existe en ORCID API
researcher = ResearcherRepository.create(db, orcid_id, name=None)
return {
"status": "ok",
"message": "Researcher creado correctamente.",
"orcid_id": researcher.orcid_id,
"id": researcher.id
}
return ResearcherWithPublicationsSchema(
researcher=researcher,
publications=publications,
new_records=0,
updated_records=0,
unchanged_records=0,
total_records=len(publications),
)
@router.get("/{orcid_id}", response_model=dict)
def get_researcher(orcid_id: str, db: Session = Depends(get_db)):
    """Return the stored record for one researcher as a plain dict.

    Raises:
        HTTPException: 400 (via ``validate_orcid_or_400``) for a malformed
            ORCID iD, 404 when no researcher with that iD is stored.
    """
    validate_orcid_or_400(orcid_id)

    found = ResearcherRepository.get_by_orcid(db, orcid_id)
    if not found:
        raise HTTPException(status_code=404, detail="Researcher not found")

    # NOTE(review): "access_token" is part of this response and the route
    # declares no API-key dependency; confirm exposing it is intended.
    exposed = (
        "orcid_id",
        "name",
        "authenticated",
        "access_token",
        "id",
        "last_sync_at",
    )
    return {attr: getattr(found, attr) for attr in exposed}
@router.post("/{orcid_id}/sync", response_model=dict)
# ---------------------------------------------------------
# ENDPOINT 2: SYNC COMPLETO (con contadores + status)
# ---------------------------------------------------------
@router.post("/{orcid_id}/sync", response_model=ResearcherWithPublicationsSchema)
def sync_researcher(orcid_id: str, db: Session = Depends(get_db)):
validate_orcid_or_400(orcid_id)
service = SyncService()
result = service.sync_researcher(db, orcid_id)
return result
@router.get("/{orcid_id}/publications", response_model=list[PublicationSchema], tags=["researchers"])
def get_publications(orcid_id: str, db: Session = Depends(get_db)):
    """List the publications stored for the researcher with this ORCID iD.

    Raises:
        HTTPException: 404 when no researcher with that iD is stored.
    """
    owner = ResearcherRepository.get_by_orcid(db, orcid_id)
    if owner:
        return owner.publications
    raise HTTPException(status_code=404, detail="Researcher not found")
@router.get("/{orcid_id}/export/sword.xml")
def export_sword_xml(orcid_id: str, db: Session = Depends(get_db)):
validate_orcid_or_400(orcid_id)
researcher = ResearcherRepository.get_by_orcid(db, orcid_id)
researcher = db.query(Researcher).filter_by(orcid_id=orcid_id).first()
if not researcher:
raise HTTPException(status_code=404, detail="Researcher not found")
pubs = PublicationRepository.list_by_researcher(db, researcher.id)
xml_bytes = SWORDExporter.export_feed_xml(researcher, pubs)
works = get_works_summary(orcid_id)
groups = works.get("group", [])
return Response(
content=xml_bytes,
media_type="application/xml",
headers={
"Content-Disposition": f'attachment; filename="sword_{orcid_id}.xml"'
}
)
@router.get("/{orcid_id}/export/sword.zip")
def export_sword_zip(orcid_id: str, db: Session = Depends(get_db)):
validate_orcid_or_400(orcid_id)
researcher = ResearcherRepository.get_by_orcid(db, orcid_id)
if not researcher:
raise HTTPException(status_code=404, detail="Researcher not found")
pubs = PublicationRepository.list_by_researcher(db, researcher.id)
zip_bytes = SWORDExporter.export_zip(researcher, pubs)
return Response(
content=zip_bytes,
media_type="application/zip",
headers={
"Content-Disposition": f'attachment; filename="sword_{orcid_id}.zip"'
}
publications_output = []
new_count = 0
updated_count = 0
unchanged_count = 0
for g in groups:
summaries = g.get("work-summary") or []
if not summaries:
continue
summary = summaries[0]
put_code = summary.get("put-code")
if put_code is None:
continue
try:
detail = get_work_detail(orcid_id, put_code)
except Exception:
detail = None
data = PublicationNormalizer.normalize(summary, detail)
existing = (
db.query(Publication)
.filter(
Publication.researcher_id == researcher.id,
Publication.put_code == data["put_code"],
)
.first()
)
if existing:
if publication_changed(existing, data):
# updated
for field in data:
setattr(existing, field, data[field])
existing.last_modified = datetime.utcnow()
existing.status = "updated"
updated_count += 1
else:
# unchanged
existing.status = "unchanged"
unchanged_count += 1
pub = existing
else:
# new
pub = Publication(
researcher_id=researcher.id,
**data,
last_modified=datetime.utcnow(),
)
pub.status = "new"
db.add(pub)
new_count += 1
db.flush()
publications_output.append(pub)
researcher.last_sync_at = datetime.utcnow()
db.commit()
db.refresh(researcher)
return ResearcherWithPublicationsSchema(
researcher=researcher,
publications=publications_output,
new_records=new_count,
updated_records=updated_count,
unchanged_records=unchanged_count,
total_records=new_count + updated_count + unchanged_count,
)
+3
View File
@@ -0,0 +1,3 @@
from sqlalchemy.orm import declarative_base
Base = declarative_base()
+41 -38
View File
@@ -1,60 +1,63 @@
from sqlalchemy import Column, String, Boolean, Integer, DateTime, Text, ForeignKey
from sqlalchemy.dialects.postgresql import UUID
from sqlalchemy.sql import func
from sqlalchemy import Column, String, Integer, Boolean, DateTime, ForeignKey
from sqlalchemy.dialects.postgresql import UUID, JSONB
from sqlalchemy.orm import relationship
from .session import Base
import uuid
from datetime import datetime
from app.db.session import Base
class Researcher(Base):
__tablename__ = "researchers"
id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
orcid_id = Column(String(19), unique=True, nullable=False)
name = Column(Text)
orcid_id = Column(String, unique=True, index=True, nullable=False)
name = Column(String, nullable=True)
authenticated = Column(Boolean, default=False)
access_token = Column(Text, nullable=True)
last_sync_at = Column(DateTime(timezone=True), server_default=func.now())
last_sync_at = Column(DateTime, nullable=True)
publications = relationship(
"Publication",
back_populates="researcher",
cascade="all, delete-orphan"
)
sync_jobs = relationship(
"SyncJob",
back_populates="researcher",
cascade="all, delete-orphan"
)
publications = relationship("Publication", back_populates="researcher", cascade="all, delete-orphan")
class Publication(Base):
__tablename__ = "publications"
id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
researcher_id = Column(UUID(as_uuid=True), ForeignKey("researchers.id"))
put_code = Column(Integer)
title = Column(Text)
journal = Column(Text)
doi = Column(Text)
pub_year = Column(Integer)
type = Column(Text)
hash_fingerprint = Column(Text)
last_modified = Column(DateTime(timezone=True))
researcher_id = Column(UUID(as_uuid=True), ForeignKey("researchers.id"), nullable=False)
researcher = relationship("Researcher", back_populates="publications")
# ORCID core
put_code = Column(Integer, index=True, nullable=False)
title = Column(String, nullable=True)
subtitle = Column(String, nullable=True)
type = Column(String, nullable=True)
class SyncJob(Base):
__tablename__ = "sync_jobs"
# Journal / container
journal = Column(String, nullable=True)
id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
researcher_id = Column(UUID(as_uuid=True), ForeignKey("researchers.id"))
status = Column(String(20))
new_records = Column(Integer, default=0)
updated_records = Column(Integer, default=0)
started_at = Column(DateTime(timezone=True), server_default=func.now())
finished_at = Column(DateTime(timezone=True))
# Dates
pub_year = Column(Integer, nullable=True)
pub_month = Column(Integer, nullable=True)
pub_day = Column(Integer, nullable=True)
researcher = relationship("Researcher", back_populates="sync_jobs")
# Identifiers / links
doi = Column(String, nullable=True)
url = Column(String, nullable=True)
# Description / citation
short_description = Column(String, nullable=True)
citation_type = Column(String, nullable=True)
citation_value = Column(String, nullable=True)
# Language / country
language_code = Column(String, nullable=True)
country = Column(String, nullable=True)
# Extra structured data
external_ids = Column(JSONB, nullable=True) # lista de external-id normalizados
contributors = Column(JSONB, nullable=True) # lista de autores/roles
# Tu campo existente
hash_fingerprint = Column(String, nullable=True)
last_modified = Column(DateTime, nullable=True, default=None)
@@ -1,7 +1,6 @@
from sqlalchemy.orm import Session
from app.db.models import Publication
class PublicationRepository:
@staticmethod
+18
View File
@@ -2,6 +2,9 @@ from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker, declarative_base
import os
# -----------------------------
# DATABASE URL
# -----------------------------
DATABASE_URL = os.getenv("DATABASE_URL")
engine = create_engine(
@@ -18,9 +21,24 @@ SessionLocal = sessionmaker(
Base = declarative_base()
# -----------------------------
# DB SESSION DEPENDENCY
# -----------------------------
def get_db():
    """Yield a fresh ``SessionLocal`` session and close it when the consumer is done.

    The ``finally`` clause closes the session whether the consumer finished
    normally or raised.
    """
    session = SessionLocal()
    try:
        yield session
    finally:
        session.close()
# -----------------------------
# INIT DB (CREA TABLAS)
# -----------------------------
def init_db():
    """Register the ORM models and create their tables on ``engine``."""
    # Import the models so SQLAlchemy registers them
    import app.db.models  # noqa

    # Create all tables if they do not exist
    Base.metadata.create_all(bind=engine)
+24 -3
View File
@@ -1,8 +1,15 @@
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from app.db.session import init_db
from app.api.researchers import router as researchers_router
from app.db.session import Base, engine
from app.api.export import router as export_router
from app.scheduler.sync_scheduler import start_scheduler
# ---------------------------------------------------------
# Crear instancia principal de FastAPI
# ---------------------------------------------------------
app = FastAPI(
title="ORCID SWORD Backend",
description="Backend para sincronización ORCID y exportación SWORD",
@@ -15,7 +22,8 @@ app = FastAPI(
# ---------------------------------------------------------
@app.on_event("startup")
def startup_event():
Base.metadata.create_all(bind=engine)
init_db() # 🔥 CREA TABLAS
start_scheduler() # 🔥 INICIA SCHEDULER
# ---------------------------------------------------------
@@ -29,4 +37,17 @@ def health():
# ---------------------------------------------------------
# Registrar routers
# ---------------------------------------------------------
app.include_router(researchers_router)
app.include_router(researchers_router, prefix="/api")
app.include_router(export_router, prefix="/api")
# ---------------------------------------------------------
# CORS
# ---------------------------------------------------------
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # restrict to the known front-end origins in production
    # Credentialed CORS (cookies / HTTP auth) must not be combined with wildcard
    # origins: it would let any site make authenticated cross-origin requests.
    # The API key travels in a request header, which does not need this flag.
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
+43
View File
@@ -0,0 +1,43 @@
import requests
from apscheduler.schedulers.background import BackgroundScheduler
from app.db.session import SessionLocal
from app.db.repositories.researcher_repository import ResearcherRepository
from dotenv import load_dotenv
import os
# Cargar variables del .env
load_dotenv()
API_KEY = os.getenv("API_KEY_VALUE")
BASE_URL = os.getenv("BASE_URL")
def run_monthly_sync():
    """Call the sync endpoint once for every stored researcher.

    Each researcher is synced with an HTTP POST to
    ``{BASE_URL}/researchers/{orcid_id}/sync``. A failure for one researcher is
    printed and does not stop the remaining ones. The database session is
    always closed, even if listing the researchers fails.

    NOTE(review): the routers appear to be mounted under an ``/api`` prefix, so
    ``BASE_URL`` presumably has to include it; confirm against the deployment.
    """
    db = SessionLocal()
    try:
        researchers = ResearcherRepository.get_all(db)

        for r in researchers:
            try:
                url = f"{BASE_URL}/researchers/{r.orcid_id}/sync"
                response = requests.post(
                    url,
                    headers={"X-API-Key": API_KEY},
                    # (connect, read) seconds. Without a timeout one stalled
                    # request would block the scheduler job indefinitely; the
                    # read limit is generous because a sync fetches every work.
                    timeout=(10, 300),
                )

                if response.status_code != 200:
                    print(f"[ERROR] Sync failed for {r.orcid_id}: {response.text}")
                else:
                    print(f"[OK] Synced {r.orcid_id}")

            except Exception as e:
                # Best-effort batch: report and move on to the next researcher.
                print(f"[EXCEPTION] Error syncing {r.orcid_id}: {e}")
    finally:
        db.close()
def start_scheduler():
    """Create a background scheduler, register ``run_monthly_sync`` on a cron trigger and start it."""
    scheduler = BackgroundScheduler()
    scheduler.add_job(run_monthly_sync, "cron", day=1, hour=3)  # day 1 at 03:00
    scheduler.start()
+16 -2
View File
@@ -1,16 +1,30 @@
from pydantic import BaseModel
from uuid import UUID
from typing import Optional, List, Any
from datetime import datetime
class PublicationSchema(BaseModel):
id: UUID
put_code: int | None = None
title: str
title: str | None = None
subtitle: str | None = None
journal: str | None = None
doi: str | None = None
pub_year: int | None = None
pub_month: int | None = None
pub_day: int | None = None
type: str | None = None
url: str | None = None
short_description: str | None = None
citation_type: str | None = None
citation_value: str | None = None
language_code: str | None = None
country: str | None = None
external_ids: List[Any] | None = None
contributors: List[Any] | None = None
hash_fingerprint: str | None = None
last_modified: str | None = None
last_modified: datetime | None = None
status: str | None = None
class Config:
from_attributes = True
+27
View File
@@ -0,0 +1,27 @@
from pydantic import BaseModel
from uuid import UUID
from typing import Optional, List
from datetime import datetime
from app.schema.publication import PublicationSchema
class ResearcherSchema(BaseModel):
    # Serialised view of a researcher: identifier, ORCID iD, display name,
    # authentication flag and time of the last sync.
    id: UUID
    orcid_id: str
    name: Optional[str]
    authenticated: bool
    last_sync_at: Optional[datetime]

    # Allows the schema to be populated from object attributes, not only dicts.
    model_config = {"from_attributes": True}
class ResearcherWithPublicationsSchema(BaseModel):
    # A researcher together with their publications and per-sync counters.
    researcher: ResearcherSchema
    publications: List[PublicationSchema]

    # New fields: how many publications were new, updated or unchanged,
    # plus the overall total.
    new_records: int
    updated_records: int
    unchanged_records: int
    total_records: int

    # Allows the schema to be populated from object attributes, not only dicts.
    model_config = {"from_attributes": True}
+27
View File
@@ -0,0 +1,27 @@
import os
from dotenv import load_dotenv
from fastapi import Depends, HTTPException, status
from fastapi.security import APIKeyHeader
# Cargar variables del .env
load_dotenv()
API_KEY_NAME = os.getenv("API_KEY_NAME")
API_KEY_VALUE = os.getenv("API_KEY_VALUE")
if not API_KEY_NAME:
raise RuntimeError("ERROR: La variable API_KEY_NAME no está definida en el .env")
if not API_KEY_VALUE:
raise RuntimeError("ERROR: La variable API_KEY_VALUE no está definida en el .env")
api_key_header = APIKeyHeader(name=API_KEY_NAME, auto_error=False)
def get_api_key(api_key: str = Depends(api_key_header)):
    """Validate the API key sent in the configured request header.

    Args:
        api_key: Header value extracted by ``api_key_header``; ``None`` when the
            header is absent, because the extractor is created with
            ``auto_error=False``.

    Returns:
        The supplied key, unchanged, when it matches ``API_KEY_VALUE``.

    Raises:
        HTTPException: 401 when the key is missing or does not match.
    """
    # Local import keeps this block self-contained; the module is stdlib.
    import secrets

    # Compare in constant time so response timing does not reveal how much of
    # the key was correct. Bytes are used because compare_digest rejects
    # non-ASCII str; a missing header becomes b"", which can never equal the
    # (non-empty, checked at import) configured key.
    supplied = (api_key or "").encode("utf-8")
    expected = API_KEY_VALUE.encode("utf-8")
    if not secrets.compare_digest(supplied, expected):
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="API key inválida o ausente."
        )
    return api_key
+89 -52
View File
@@ -1,74 +1,111 @@
from typing import List
def _get(d: dict | None, *keys, default=None):
cur = d or {}
for k in keys:
if not isinstance(cur, dict):
return default
cur = cur.get(k)
if cur is None:
return default
return cur
class PublicationNormalizer:
@staticmethod
def safe_get_title(summary):
t = summary.get("title")
def normalize(summary: dict, detail: dict | None = None) -> dict:
"""
summary: work-summary de ORCID
detail: work completo (puede ser None si la llamada falla)
"""
if t is None:
return None
# --- Core desde summary ---
put_code = summary.get("put-code")
# Caso 1: {"title": {"value": "..."}}
if isinstance(t, dict) and "title" in t and isinstance(t["title"], dict):
return t["title"].get("value")
title = _get(summary, "title", "title", "value")
type_ = summary.get("type")
# Caso 2: {"title": {"title": "..."}} (muy común en /works)
if isinstance(t, dict) and "title" in t and isinstance(t["title"], str):
return t["title"]
journal = _get(summary, "journal-title", "value")
# Caso 3: {"title": "string"}
if isinstance(t, str):
return t
year = _get(summary, "publication-date", "year", "value")
month = _get(summary, "publication-date", "month", "value")
day = _get(summary, "publication-date", "day", "value")
# Caso 4: {"value": "..."}
if isinstance(t, dict) and "value" in t:
return t["value"]
url = _get(summary, "url", "value")
short_description = summary.get("short-description")
return None
@staticmethod
def normalize_work(summary: dict) -> dict:
title = PublicationNormalizer.safe_get_title(summary)
# Journal title
journal_raw = summary.get("journal-title")
if isinstance(journal_raw, dict):
journal = journal_raw.get("value") or journal_raw.get("title")
else:
journal = journal_raw
# DOI
# DOI desde summary (external-ids)
doi = None
ext_ids = summary.get("external-ids", {}).get("external-id", [])
for ext in ext_ids:
external_ids_list: List[dict] = _get(
summary, "external-ids", "external-id", default=[]
) or []
for ext in external_ids_list:
if ext.get("external-id-type") == "doi":
doi = ext.get("external-id-value")
break
# Publication year
pub_year = (
summary.get("publication-date", {})
.get("year", {})
.get("value")
)
# --- Si tenemos detail, enriquecemos ---
subtitle = None
citation_type = None
citation_value = None
language_code = None
country = None
external_ids_full: List[dict] | None = None
contributors: List[dict] | None = None
# Type
work_type = summary.get("type")
if detail:
# Subtitle
subtitle = _get(detail, "title", "subtitle", "value") or subtitle
# put-code
put_code = summary.get("put-code")
# Citation
citation_type = _get(detail, "citation", "citation-type")
citation_value = _get(detail, "citation", "citation-value")
# Fingerprint
fingerprint = f"{title}-{doi}-{pub_year}-{work_type}"
if fingerprint:
fingerprint = fingerprint.lower().replace(" ", "")
# Language
language_code = detail.get("language-code")
# Country
country = _get(detail, "country", "value")
# External IDs completos
external_ids_full = _get(
detail, "external-ids", "external-id", default=[]
) or []
# Contributors
raw_contributors = _get(
detail, "contributors", "contributor", default=[]
) or []
contributors = []
for c in raw_contributors:
contributors.append(
{
"name": _get(c, "credit-name", "value"),
"orcid": _get(c, "contributor-orcid", "path"),
"role": _get(
c, "contributor-attributes", "contributor-role"
),
}
)
return {
"put_code": put_code,
"title": title or "Untitled",
"title": title,
"subtitle": subtitle,
"type": type_,
"journal": journal,
"pub_year": int(year) if year is not None else None,
"pub_month": int(month) if month is not None else None,
"pub_day": int(day) if day is not None else None,
"doi": doi,
"pub_year": pub_year,
"type": work_type,
"hash_fingerprint": fingerprint
"url": url,
"short_description": short_description,
"citation_type": citation_type,
"citation_value": citation_value,
"language_code": language_code,
"country": country,
"external_ids": external_ids_full,
"contributors": contributors,
"hash_fingerprint": None,
}
+43 -19
View File
@@ -1,28 +1,28 @@
import httpx
import os
from typing import Optional
import httpx
TOKEN_URL_SANDBOX = "https://sandbox.orcid.org/oauth/token"
BASE_URL_SANDBOX = "https://pub.sandbox.orcid.org/v3.0"
# Si en algún momento pasas a producción, cambiarías a:
# TOKEN_URL_PROD = "https://orcid.org/oauth/token"
# BASE_URL_PROD = "https://pub.orcid.org/v3.0"
class ORCIDClient:
TOKEN_URL = "https://sandbox.orcid.org/oauth/token"
BASE_URL = "https://pub.sandbox.orcid.org/v3.0"
# TOKEN_URL = "https://orcid.org/oauth/token"
# BASE_URL = "https://pub.orcid.org/v3.0"
def __init__(self):
self.client_id = os.getenv("ORCID_CLIENT_ID")
self.client_secret = os.getenv("ORCID_CLIENT_SECRET")
self._token_cache: Optional[str] = None
self.token_url = TOKEN_URL_SANDBOX
self.base_url = BASE_URL_SANDBOX
# ---------------------------------------------------------
# 1. Obtener token público
# ---------------------------------------------------------
def get_public_token(self) -> str:
"""
Obtiene un token público de ORCID (scope: /read-public).
Se cachea en memoria para evitar pedirlo cada vez.
"""
if self._token_cache:
return self._token_cache
@@ -30,11 +30,11 @@ class ORCIDClient:
"client_id": self.client_id,
"client_secret": self.client_secret,
"grant_type": "client_credentials",
"scope": "/read-public"
"scope": "/read-public",
}
with httpx.Client(timeout=20.0) as client:
response = client.post(self.TOKEN_URL, data=data)
response = client.post(self.token_url, data=data)
response.raise_for_status()
token = response.json()["access_token"]
self._token_cache = token
@@ -43,29 +43,53 @@ class ORCIDClient:
# ---------------------------------------------------------
# Headers comunes
# ---------------------------------------------------------
def _headers(self):
def _headers(self) -> dict:
token = self.get_public_token()
return {
"Accept": "application/json",
"Authorization": f"Bearer {token}"
"Authorization": f"Bearer {token}",
}
# ---------------------------------------------------------
# 2. Consultar /record
# ---------------------------------------------------------
def fetch_record(self, orcid_id: str) -> dict:
url = f"{self.BASE_URL}/{orcid_id}/record"
url = f"{self.base_url}/{orcid_id}/record"
with httpx.Client(timeout=20.0) as client:
response = client.get(url, headers=self._headers())
response.raise_for_status()
return response.json()
# ---------------------------------------------------------
# 3. Consultar /works
# 3. Consultar /works (summary)
# ---------------------------------------------------------
def fetch_works(self, orcid_id: str) -> dict:
url = f"{self.BASE_URL}/{orcid_id}/works"
url = f"{self.base_url}/{orcid_id}/works"
with httpx.Client(timeout=20.0) as client:
response = client.get(url, headers=self._headers())
response.raise_for_status()
return response.json()
# ---------------------------------------------------------
# 4. Consultar /work/{put_code} (detalle)
# ---------------------------------------------------------
def fetch_work_detail(self, orcid_id: str, put_code: int) -> dict | None:
    """Fetch the full record of one work; ``None`` when the reply is not HTTP 200."""
    endpoint = f"{self.base_url}/{orcid_id}/work/{put_code}"
    with httpx.Client(timeout=20.0) as http:
        reply = http.get(endpoint, headers=self._headers())
        return reply.json() if reply.status_code == 200 else None
# -------------------------------------------------------------------
# Funciones de módulo usadas en researchers.py
# -------------------------------------------------------------------
# One client for the whole process, so the token cached on the instance is
# actually reused. Building a new ORCIDClient per call requested a fresh
# token for every works listing and every single work detail.
_shared_client: "ORCIDClient | None" = None


def _get_shared_client() -> "ORCIDClient":
    """Return the process-wide ``ORCIDClient``, creating it on first use."""
    global _shared_client
    if _shared_client is None:
        _shared_client = ORCIDClient()
    return _shared_client


def get_works_summary(orcid_id: str) -> dict:
    """Return the works summary for ``orcid_id`` via ``ORCIDClient.fetch_works``."""
    return _get_shared_client().fetch_works(orcid_id)


def get_work_detail(orcid_id: str, put_code: int) -> dict | None:
    """Return the detail of one work via ``ORCIDClient.fetch_work_detail``."""
    return _get_shared_client().fetch_work_detail(orcid_id, put_code)
-155
View File
@@ -1,155 +0,0 @@
from datetime import datetime
from xml.etree.ElementTree import Element, SubElement, tostring
from io import BytesIO
import zipfile
import json
class SWORDExporter:
ATOM_NS = "http://www.w3.org/2005/Atom"
DC_NS = "http://purl.org/dc/elements/1.1/"
# ---------------------------------------------------------
# 1) XML PRINCIPAL (sword.xml)
# ---------------------------------------------------------
@staticmethod
def export_feed_xml(researcher, publications) -> bytes:
feed = Element("feed", xmlns=SWORDExporter.ATOM_NS)
title = SubElement(feed, "title")
title.text = f"Publications for {researcher.orcid_id}"
author = SubElement(feed, "author")
name = SubElement(author, "name")
name.text = researcher.name or "Unknown"
updated = SubElement(feed, "updated")
updated.text = datetime.utcnow().isoformat() + "Z"
feed_id = SubElement(feed, "id")
feed_id.text = f"urn:uuid:{researcher.id}"
for pub in publications:
entry = SubElement(feed, "entry")
entry_id = SubElement(entry, "id")
entry_id.text = f"urn:uuid:{pub.id}"
entry_updated = SubElement(entry, "updated")
entry_updated.text = datetime.utcnow().isoformat() + "Z"
dc_title = SubElement(entry, f"{{{SWORDExporter.DC_NS}}}title")
dc_title.text = pub.title
if pub.doi:
dc_identifier = SubElement(entry, f"{{{SWORDExporter.DC_NS}}}identifier")
dc_identifier.text = f"doi:{pub.doi}"
if pub.pub_year:
dc_date = SubElement(entry, f"{{{SWORDExporter.DC_NS}}}date")
dc_date.text = str(pub.pub_year)
if pub.type:
dc_type = SubElement(entry, f"{{{SWORDExporter.DC_NS}}}type")
dc_type.text = pub.type
if pub.journal:
dc_source = SubElement(entry, f"{{{SWORDExporter.DC_NS}}}source")
dc_source.text = pub.journal
xml_bytes = tostring(feed, encoding="utf-8", xml_declaration=True)
return xml_bytes
# ---------------------------------------------------------
# 2) manifest.txt
# ---------------------------------------------------------
@staticmethod
def generate_manifest(researcher, publications) -> str:
lines = [
"SWORD Deposit Package",
"----------------------",
f"Researcher ORCID: {researcher.orcid_id}",
f"Researcher Name: {researcher.name or 'Unknown'}",
f"Total Publications: {len(publications)}",
f"Generated At: {datetime.utcnow().isoformat()}Z",
"",
"Publications:",
]
for pub in publications:
lines.append(f"- {pub.title} ({pub.pub_year}) DOI={pub.doi}")
return "\n".join(lines)
# ---------------------------------------------------------
# 3) metadata.json
# ---------------------------------------------------------
@staticmethod
def generate_metadata_json(researcher, publications) -> str:
data = {
"researcher": {
"orcid_id": researcher.orcid_id,
"name": researcher.name,
"id": str(researcher.id),
},
"generated_at": datetime.utcnow().isoformat() + "Z",
"publications": [
{
"id": str(pub.id),
"title": pub.title,
"doi": pub.doi,
"year": pub.pub_year,
"type": pub.type,
"journal": pub.journal,
}
for pub in publications
],
}
return json.dumps(data, indent=4)
# ---------------------------------------------------------
# 4) mets.xml (versión simple)
# ---------------------------------------------------------
@staticmethod
def generate_mets_xml(researcher, publications) -> bytes:
mets = Element("mets", xmlns="http://www.loc.gov/METS/")
header = SubElement(mets, "metsHdr")
agent = SubElement(header, "agent", ROLE="CREATOR", TYPE="OTHER")
name = SubElement(agent, "name")
name.text = "ORCID Exporter System"
dmd_sec = SubElement(mets, "dmdSec", ID="dmd1")
md_wrap = SubElement(dmd_sec, "mdWrap", MDTYPE="DC")
xml_data = SubElement(md_wrap, "xmlData")
for pub in publications:
dc_title = SubElement(xml_data, f"{{{SWORDExporter.DC_NS}}}title")
dc_title.text = pub.title
if pub.doi:
dc_id = SubElement(xml_data, f"{{{SWORDExporter.DC_NS}}}identifier")
dc_id.text = f"doi:{pub.doi}"
return tostring(mets, encoding="utf-8", xml_declaration=True)
# ---------------------------------------------------------
# 5) ZIP FINAL
# ---------------------------------------------------------
@staticmethod
def export_zip(researcher, publications) -> bytes:
    """Bundle the four deposit documents into an in-memory ZIP archive.

    The archive holds ``sword.xml``, ``manifest.txt``, ``metadata.json`` and
    ``mets.xml``, each produced by the corresponding ``SWORDExporter`` method
    and stored with DEFLATE compression.

    Args:
        researcher: Passed through unchanged to every generator.
        publications: Passed through unchanged to every generator.

    Returns:
        The raw bytes of the finished ZIP file.
    """
    # All documents are generated (in this order) before the archive is opened.
    members = (
        ("sword.xml", SWORDExporter.export_feed_xml(researcher, publications)),
        ("manifest.txt", SWORDExporter.generate_manifest(researcher, publications)),
        ("metadata.json", SWORDExporter.generate_metadata_json(researcher, publications)),
        ("mets.xml", SWORDExporter.generate_mets_xml(researcher, publications)),
    )

    buffer = BytesIO()
    with zipfile.ZipFile(buffer, mode="w", compression=zipfile.ZIP_DEFLATED) as archive:
        for arcname, content in members:
            archive.writestr(arcname, content)
    # The central directory is written when the context manager closes, so the
    # buffer is only complete at this point.
    return buffer.getvalue()
+112
View File
@@ -0,0 +1,112 @@
from datetime import datetime
from xml.etree.ElementTree import Element, SubElement, register_namespace, tostring
from app.db.models import Publication, Researcher
ATOM_NS = "http://www.w3.org/2005/Atom"
DC_NS = "http://purl.org/dc/elements/1.1/"
EXTRA_NS = "http://example.org/orcid-extra"  # namespace for the extended (non-DC) fields

# Register the prefixes so the serializer emits the xmlns declarations itself.
# Declaring them by hand as attributes on <feed> clashes with the declaration
# ElementTree generates for namespace-qualified tags (DC_NS is in its built-in
# prefix map as "dc"), which yields a duplicate ``xmlns:dc`` attribute, i.e.
# XML that parsers reject.
register_namespace("dc", DC_NS)
register_namespace("extra", EXTRA_NS)


class SWORDGenerator:
    """Builds the Atom feed (``sword.xml``) describing a researcher's publications."""

    @staticmethod
    def _text(value):
        """Return *value* as element text: ``None`` stays ``None``, else ``str(value)``."""
        return None if value is None else str(value)

    @staticmethod
    def _format_date(pub):
        """Return ``YYYY``, ``YYYY-MM`` or ``YYYY-MM-DD`` for *pub*; ``None`` without a year."""
        if not pub.pub_year:
            return None
        date_str = str(pub.pub_year)
        if pub.pub_month:
            date_str += f"-{int(pub.pub_month):02d}"
            # A day is only meaningful once the month is known; appending it
            # to a bare year would produce an ambiguous "YYYY-DD".
            if pub.pub_day:
                date_str += f"-{int(pub.pub_day):02d}"
        return date_str

    @staticmethod
    def generate_feed_xml(researcher: "Researcher", publications: "list[Publication]") -> bytes:
        """Serialize *researcher* and *publications* as an Atom feed.

        Feed-level and structural elements are unqualified and fall under the
        default Atom namespace; publication metadata uses Dublin Core (``dc``)
        where a matching element exists and the ``extra`` namespace otherwise.
        Optional fields are emitted only when they hold a truthy value.

        Args:
            researcher: Object exposing ``orcid_id``, ``name`` and ``id``.
            publications: Iterable of publication objects (see the attribute
                reads below for the fields used).

        Returns:
            UTF-8 encoded XML bytes, including the XML declaration.
        """
        dc = f"{{{DC_NS}}}"
        extra = f"{{{EXTRA_NS}}}"
        text = SWORDGenerator._text

        # One timestamp for the whole document keeps feed and entries consistent.
        now = datetime.utcnow().isoformat() + "Z"

        feed = Element("feed", {"xmlns": ATOM_NS})
        SubElement(feed, "title").text = f"Publications for {researcher.orcid_id}"
        author = SubElement(feed, "author")
        SubElement(author, "name").text = researcher.name or "Unknown"
        SubElement(feed, "updated").text = now
        SubElement(feed, "id").text = f"urn:uuid:{researcher.id}"

        for pub in publications:
            entry = SubElement(feed, "entry")
            SubElement(entry, "id").text = f"urn:uuid:{pub.id}"
            SubElement(entry, "updated").text = now

            # Title
            SubElement(entry, dc + "title").text = pub.title or "Untitled"
            # Subtitle
            if pub.subtitle:
                SubElement(entry, extra + "subtitle").text = pub.subtitle
            # DOI
            if pub.doi:
                SubElement(entry, dc + "identifier").text = f"doi:{pub.doi}"
            # Journal
            if pub.journal:
                SubElement(entry, dc + "source").text = pub.journal
            # URL
            if pub.url:
                SubElement(entry, dc + "relation").text = pub.url
            # Short description
            if pub.short_description:
                SubElement(entry, dc + "description").text = pub.short_description
            # Citation
            if pub.citation_value:
                cit = SubElement(entry, extra + "citation")
                SubElement(cit, "type").text = pub.citation_type or "unknown"
                SubElement(cit, "value").text = pub.citation_value
            # Language
            if pub.language_code:
                SubElement(entry, dc + "language").text = pub.language_code
            # Country
            if pub.country:
                SubElement(entry, extra + "country").text = pub.country

            # External IDs: each mapping becomes one <external_id> whose
            # children mirror the mapping's keys.
            if pub.external_ids:
                ext_ids_el = SubElement(entry, extra + "external_ids")
                for ext in pub.external_ids:
                    ext_el = SubElement(ext_ids_el, "external_id")
                    for key, raw in ext.items():
                        # Unwrap {"value": ...} wrappers.
                        if isinstance(raw, dict) and "value" in raw:
                            raw = raw["value"]
                        if raw is None:
                            # Skip absent values rather than writing "None".
                            continue
                        # Element text must be str or serialization fails.
                        SubElement(ext_el, key).text = str(raw)

            # Contributors
            if pub.contributors:
                contribs_el = SubElement(entry, extra + "contributors")
                for c in pub.contributors:
                    c_el = SubElement(contribs_el, "contributor")
                    SubElement(c_el, "name").text = text(c.get("name"))
                    SubElement(c_el, "orcid").text = text(c.get("orcid"))
                    SubElement(c_el, "role").text = text(c.get("role"))

            # Date
            date_str = SWORDGenerator._format_date(pub)
            if date_str is not None:
                SubElement(entry, dc + "date").text = date_str
            # Type
            if pub.type:
                SubElement(entry, dc + "type").text = pub.type
            # Status (new / updated / unchanged); the attribute may be absent.
            status = getattr(pub, "status", None)
            if status:
                SubElement(entry, extra + "status").text = status
            # Last modified
            if pub.last_modified:
                SubElement(entry, extra + "last_modified").text = pub.last_modified.isoformat()

        return tostring(feed, encoding="utf-8", xml_declaration=True)
+57 -17
View File
@@ -1,10 +1,12 @@
from sqlalchemy.orm import Session
import httpx
from app.services.orcid_client import ORCIDClient
from app.services.normalizer import PublicationNormalizer
from app.repositories.researcher_repository import ResearcherRepository
from app.repositories.publication_repository import PublicationRepository
from app.repositories.syncjob_repository import SyncJobRepository
import httpx
from app.db.repositories.researcher_repository import ResearcherRepository
from app.db.repositories.publication_repository import PublicationRepository
from app.db.repositories.syncjob_repository import SyncJobRepository
class SyncService:
@@ -16,8 +18,6 @@ class SyncService:
"""
Sincroniza las publicaciones de un investigador con manejo robusto de errores.
"""
# 1. Obtener o crear investigador
try:
researcher = ResearcherRepository.get_by_orcid(db, orcid_id)
@@ -35,14 +35,23 @@ class SyncService:
if e.response.status_code == 404:
return {
"status": "error",
"message": f"El ORCID {orcid_id} no existe en Sandbox."
"code": 404,
"message": f"El ORCID {orcid_id} no existe en ORCID."
}
return {"status": "error", "message": str(e)}
return {
"status": "error",
"code": e.response.status_code,
"message": f"Error al consultar ORCID: {str(e)}"
}
except Exception as e:
return {
"status": "error",
"code": 500,
"message": f"Error interno durante la sincronización: {str(e)}"
}
# 2. Crear SyncJob
job = SyncJobRepository.start_job(db, researcher.id)
# 3. Obtener works
try:
works_raw = self.orcid_client.fetch_works(orcid_id)
except httpx.HTTPStatusError as e:
@@ -56,19 +65,27 @@ class SyncService:
"updated_records": 0,
"total": 0
}
return {"status": "error", "message": str(e)}
return {
"status": "error",
"code": e.response.status_code,
"message": f"Error al obtener works de ORCID: {str(e)}"
}
except Exception as e:
return {
"status": "error",
"code": 500,
"message": f"Error interno al obtener works: {str(e)}"
}
groups = works_raw.get("group", [])
new_records = 0
updated_records = 0
# 4. Procesar works
for group in groups:
summary = group["work-summary"][0]
normalized = PublicationNormalizer.normalize_work(summary)
# 🔥 AHORA SE DETECTAN DUPLICADOS POR put_code
existing = PublicationRepository.get_by_put_code(
db, researcher.id, normalized["put_code"]
)
@@ -80,17 +97,40 @@ class SyncService:
PublicationRepository.create(db, researcher.id, normalized)
new_records += 1
# 5. Finalizar SyncJob
SyncJobRepository.finish_job(db, job, new_records, updated_records)
# 6. Actualizar last_sync_at
ResearcherRepository.update_last_sync(db, researcher)
return {
"status": "ok",
"message": "Sincronización completada correctamente.",
"researcher": researcher.orcid_id,
"researcher_id": researcher.id,
"new_records": new_records,
"updated_records": updated_records,
"total": new_records + updated_records
}
def sync_and_get_full(self, db: Session, orcid_id: str):
    """Synchronize a researcher and return them together with their publications.

    Intended for the search box, so the client needs a single request.

    Args:
        db: Database session handed to the sync step and the repositories.
        orcid_id: ORCID identifier of the researcher to synchronize.

    Returns:
        The error dict from ``sync_researcher`` unchanged when the sync fails;
        a ``code`` 500 error dict when the researcher cannot be loaded
        afterwards; otherwise a dict with ``status`` ``"ok"``, the
        ``researcher`` and their ``publications``.
    """
    outcome = self.sync_researcher(db, orcid_id)
    if outcome.get("status") == "error":
        # Propagate sync failures verbatim so callers see the original code/message.
        return outcome

    found = ResearcherRepository.get_by_orcid(db, orcid_id)
    if found:
        return {
            "status": "ok",
            "researcher": found,
            "publications": PublicationRepository.list_by_researcher(db, found.id),
        }

    # The sync reported success yet the researcher cannot be read back.
    return {
        "status": "error",
        "code": 500,
        "message": "Error interno: investigador no encontrado tras sincronización.",
    }
+165
View File
@@ -0,0 +1,165 @@
import io
import zipfile
import json
from datetime import datetime
from xml.etree.ElementTree import Element, SubElement, tostring
from app.db.models import Publication, Researcher
from app.services.sword_generator import SWORDGenerator
class ZIPGenerator:
    """Assembles the SWORD deposit ZIP: Atom feed, manifest, JSON metadata and METS."""

    # Clark-notation prefix for Dublin Core element tags.
    _DC = "{http://purl.org/dc/elements/1.1/}"

    @staticmethod
    def _format_date(pub):
        """Return ``YYYY``, ``YYYY-MM`` or ``YYYY-MM-DD`` for *pub*; ``None`` without a year."""
        if not pub.pub_year:
            return None
        date_str = str(pub.pub_year)
        if pub.pub_month:
            date_str += f"-{int(pub.pub_month):02d}"
            # A day is only meaningful once the month is known; appending it
            # to a bare year would produce an ambiguous "YYYY-DD".
            if pub.pub_day:
                date_str += f"-{int(pub.pub_day):02d}"
        return date_str

    # ---------------------------------------------------------
    # MANIFEST.TXT
    # ---------------------------------------------------------
    @staticmethod
    def generate_manifest(researcher, publications):
        """Build the plain-text manifest listing the researcher and every publication.

        Args:
            researcher: Object exposing ``orcid_id``, ``name`` and ``id``.
            publications: Sized iterable of objects exposing ``title``,
                ``pub_year``, ``doi`` and ``type``.

        Returns:
            Newline-joined manifest text (no trailing newline).
        """
        lines = [
            "SWORD Deposit Package",
            "----------------------",
            f"Researcher ORCID: {researcher.orcid_id}",
            # Same fallback the Atom feed uses for a nameless researcher.
            f"Researcher Name: {researcher.name or 'Unknown'}",
            f"Researcher UUID: {researcher.id}",
            f"Total Publications: {len(publications)}",
            f"Generated At: {datetime.utcnow().isoformat()}Z",
            "",
            "Publications:",
        ]
        for pub in publications:
            title = pub.title or "Untitled"
            year = pub.pub_year or "Unknown"
            lines.append(f"- {title} ({year}) | DOI={pub.doi} | TYPE={pub.type}")
        return "\n".join(lines)

    # ---------------------------------------------------------
    # METADATA.JSON (every publication field)
    # ---------------------------------------------------------
    @staticmethod
    def generate_metadata_json(researcher, publications):
        """Serialize the researcher and the full field set of each publication as JSON.

        Datetime fields (``last_sync_at``, ``last_modified``) are written in
        ISO 8601 form, or ``null`` when unset; ``status`` is ``null`` when the
        publication has no such attribute.

        Returns:
            JSON text indented by four spaces.
        """
        data = {
            "researcher": {
                "orcid_id": researcher.orcid_id,
                "name": researcher.name,
                "id": str(researcher.id),
                "last_sync_at": researcher.last_sync_at.isoformat() if researcher.last_sync_at else None,
            },
            "generated_at": datetime.utcnow().isoformat() + "Z",
            "publications": [
                {
                    "id": str(pub.id),
                    "put_code": pub.put_code,
                    "title": pub.title,
                    "subtitle": pub.subtitle,
                    "doi": pub.doi,
                    "journal": pub.journal,
                    "type": pub.type,
                    "url": pub.url,
                    "short_description": pub.short_description,
                    "citation_type": pub.citation_type,
                    "citation_value": pub.citation_value,
                    "language_code": pub.language_code,
                    "country": pub.country,
                    "pub_year": pub.pub_year,
                    "pub_month": pub.pub_month,
                    "pub_day": pub.pub_day,
                    "external_ids": pub.external_ids,
                    "contributors": pub.contributors,
                    "hash_fingerprint": pub.hash_fingerprint,
                    "last_modified": pub.last_modified.isoformat() if pub.last_modified else None,
                    "status": getattr(pub, "status", None),
                }
                for pub in publications
            ],
        }
        return json.dumps(data, indent=4)

    # ---------------------------------------------------------
    # METS.XML (extended Dublin Core metadata)
    # ---------------------------------------------------------
    @staticmethod
    def generate_mets_xml(researcher, publications):
        """Build a METS document whose single ``xmlData`` holds DC elements for all publications.

        Args:
            researcher: Accepted for signature symmetry; not read by this method.
            publications: Iterable of publication objects.

        Returns:
            UTF-8 encoded XML bytes, including the XML declaration.
        """
        dc = ZIPGenerator._DC

        mets = Element("mets", xmlns="http://www.loc.gov/METS/")
        header = SubElement(mets, "metsHdr")
        agent = SubElement(header, "agent", ROLE="CREATOR", TYPE="OTHER")
        SubElement(agent, "name").text = "ORCID Exporter System"
        dmd_sec = SubElement(mets, "dmdSec", ID="dmd1")
        md_wrap = SubElement(dmd_sec, "mdWrap", MDTYPE="DC")
        xml_data = SubElement(md_wrap, "xmlData")

        for pub in publications:
            # (tag, value) pairs in output order; falsy values are skipped.
            fields = (
                # Same placeholder the Atom feed uses, so dc:title is never empty.
                ("title", pub.title or "Untitled"),
                ("description", pub.subtitle),
                ("identifier", f"doi:{pub.doi}" if pub.doi else None),
                ("source", pub.journal),
                ("relation", pub.url),
                ("description", pub.short_description),
                # NOTE(review): bibliographicCitation is defined in DCMI Terms
                # (http://purl.org/dc/terms/), not DC Elements 1.1. Confirm
                # what the receiving repository expects before changing it.
                ("bibliographicCitation", pub.citation_value),
                ("language", pub.language_code),
                ("coverage", pub.country),
                ("date", ZIPGenerator._format_date(pub)),
                ("type", pub.type),
            )
            for tag, value in fields:
                if value:
                    SubElement(xml_data, dc + tag).text = value

        return tostring(mets, encoding="utf-8", xml_declaration=True)

    # ---------------------------------------------------------
    # FINAL ZIP
    # ---------------------------------------------------------
    @staticmethod
    def generate_zip(researcher, publications):
        """Bundle ``sword.xml``, ``manifest.txt``, ``metadata.json`` and ``mets.xml`` into a ZIP.

        Returns:
            The raw bytes of the DEFLATE-compressed archive.
        """
        xml_bytes = SWORDGenerator.generate_feed_xml(researcher, publications)
        manifest = ZIPGenerator.generate_manifest(researcher, publications)
        metadata_json = ZIPGenerator.generate_metadata_json(researcher, publications)
        mets_xml = ZIPGenerator.generate_mets_xml(researcher, publications)

        mem_file = io.BytesIO()
        with zipfile.ZipFile(mem_file, "w", zipfile.ZIP_DEFLATED) as zf:
            zf.writestr("sword.xml", xml_bytes)
            zf.writestr("manifest.txt", manifest)
            zf.writestr("metadata.json", metadata_json)
            zf.writestr("mets.xml", mets_xml)
        # The archive is complete only after the context manager has closed it.
        return mem_file.getvalue()
+3 -1
View File
@@ -8,4 +8,6 @@ python-dotenv
lxml
apscheduler
authlib
redis
redis
APScheduler==3.10.4
requests