feat: implement researcher and publication management with sync functionality

2026-04-21 13:59:41 +02:00
parent 7717e2a5b2
commit a286c2e3ae
13 changed files with 698 additions and 23 deletions
@@ -0,0 +1,120 @@
 from fastapi import APIRouter, Depends, HTTPException
 from fastapi.responses import Response
 from sqlalchemy.orm import Session
 from app.schema.publication import PublicationSchema
 from app.db.session import get_db
 from app.repositories.researcher_repository import ResearcherRepository
 from app.repositories.publication_repository import PublicationRepository
 from app.services.sync_service import SyncService
 from app.services.sword_exporter import SWORDExporter
 from app.utils.orcid_validator import is_valid_orcid
 router = APIRouter(prefix="/researchers", tags=["researchers"])
 def validate_orcid_or_400(orcid_id: str):
    """Abort the request with HTTP 400 when ``orcid_id`` is not a valid ORCID iD.

    Validity is delegated to ``is_valid_orcid``. Returns ``None`` when the
    identifier is accepted.

    Raises:
        HTTPException: status 400 when the identifier is rejected.
    """
    # Guard clause: nothing to do for a well-formed identifier.
    if is_valid_orcid(orcid_id):
        return
    message = f"ORCID ID '{orcid_id}' no es válido según el formato y dígito de control."
    raise HTTPException(status_code=400, detail=message)
@router.post("/", response_model=dict)
def create_researcher(orcid_id: str, db: Session = Depends(get_db)):
    """Register a researcher by ORCID iD, reusing the existing row if present.

    The identifier is validated first (HTTP 400 on failure). The response
    has the same shape whether the researcher already existed or was just
    created; only ``message`` differs.
    """
    validate_orcid_or_400(orcid_id)

    researcher = ResearcherRepository.get_by_orcid(db, orcid_id)
    if researcher:
        message = "Researcher ya existe."
    else:
        # Optionally, this is where the iD could be checked against the ORCID API.
        researcher = ResearcherRepository.create(db, orcid_id, name=None)
        message = "Researcher creado correctamente."

    return {
        "status": "ok",
        "message": message,
        "orcid_id": researcher.orcid_id,
        "id": researcher.id,
    }
@router.get("/{orcid_id}", response_model=dict)
def get_researcher(orcid_id: str, db: Session = Depends(get_db)):
    """Return the stored profile of a researcher.

    Args:
        orcid_id: ORCID iD taken from the URL path.
        db: SQLAlchemy session injected by FastAPI.

    Returns:
        A dict with ``orcid_id``, ``name``, ``authenticated``, ``id`` and
        ``last_sync_at``.

    Raises:
        HTTPException: 400 for a malformed iD, 404 when no such researcher
            is stored.
    """
    validate_orcid_or_400(orcid_id)
    researcher = ResearcherRepository.get_by_orcid(db, orcid_id)
    if not researcher:
        raise HTTPException(status_code=404, detail="Researcher not found")
    # SECURITY: the stored OAuth ``access_token`` is deliberately NOT returned.
    # This route has no authentication, so echoing the token would hand the
    # researcher's credential to any caller; ``authenticated`` already tells
    # clients whether the researcher has authorised the application.
    return {
        "orcid_id": researcher.orcid_id,
        "name": researcher.name,
        "authenticated": researcher.authenticated,
        "id": researcher.id,
        "last_sync_at": researcher.last_sync_at,
    }
@router.post("/{orcid_id}/sync", response_model=dict)
def sync_researcher(orcid_id: str, db: Session = Depends(get_db)):
    """Run a synchronisation for ``orcid_id`` and return the service's result dict.

    The identifier is validated first (HTTP 400 on failure); everything else
    is delegated to a freshly created ``SyncService``.
    """
    validate_orcid_or_400(orcid_id)
    return SyncService().sync_researcher(db, orcid_id)
@router.get("/{orcid_id}/publications", response_model=list[PublicationSchema])
def get_publications(orcid_id: str, db: Session = Depends(get_db)):
    """List the publications stored for a researcher.

    Args:
        orcid_id: ORCID iD taken from the URL path.
        db: SQLAlchemy session injected by FastAPI.

    Returns:
        The researcher's ``publications`` relationship, serialised through
        ``PublicationSchema``.

    Raises:
        HTTPException: 400 for a malformed iD, 404 when no such researcher
            is stored.
    """
    # Validate like every other ``{orcid_id}`` route in this router, so a
    # malformed identifier yields 400 instead of a misleading 404.
    validate_orcid_or_400(orcid_id)
    researcher = ResearcherRepository.get_by_orcid(db, orcid_id)
    if not researcher:
        raise HTTPException(status_code=404, detail="Researcher not found")
    # The "researchers" tag comes from the router itself; repeating it on the
    # route only duplicated the tag in the generated OpenAPI document.
    return researcher.publications
@router.get("/{orcid_id}/export/sword.xml")
def export_sword_xml(orcid_id: str, db: Session = Depends(get_db)):
    """Download the researcher's publications as a single XML feed.

    Responds 400 for a malformed iD and 404 when the researcher is unknown;
    otherwise returns the bytes produced by ``SWORDExporter.export_feed_xml``
    as an ``application/xml`` attachment.
    """
    validate_orcid_or_400(orcid_id)

    owner = ResearcherRepository.get_by_orcid(db, orcid_id)
    if not owner:
        raise HTTPException(status_code=404, detail="Researcher not found")

    works = PublicationRepository.list_by_researcher(db, owner.id)
    payload = SWORDExporter.export_feed_xml(owner, works)

    # Ask the client to save the body as a file named after the ORCID iD.
    download_headers = {
        "Content-Disposition": f'attachment; filename="sword_{orcid_id}.xml"'
    }
    return Response(
        content=payload,
        media_type="application/xml",
        headers=download_headers,
    )
@router.get("/{orcid_id}/export/sword.zip")
def export_sword_zip(orcid_id: str, db: Session = Depends(get_db)):
    """Download the researcher's publications as a ZIP package.

    Responds 400 for a malformed iD and 404 when the researcher is unknown;
    otherwise returns the bytes produced by ``SWORDExporter.export_zip`` as
    an ``application/zip`` attachment.
    """
    validate_orcid_or_400(orcid_id)

    owner = ResearcherRepository.get_by_orcid(db, orcid_id)
    if not owner:
        raise HTTPException(status_code=404, detail="Researcher not found")

    works = PublicationRepository.list_by_researcher(db, owner.id)
    archive = SWORDExporter.export_zip(owner, works)

    # Ask the client to save the body as a file named after the ORCID iD.
    download_headers = {
        "Content-Disposition": f'attachment; filename="sword_{orcid_id}.zip"'
    }
    return Response(
        content=archive,
        media_type="application/zip",
        headers=download_headers,
    )
@@ -1,9 +1,11 @@
-from sqlalchemy import Column, String, Boolean, Integer, DateTime, Text
+from sqlalchemy import Column, String, Boolean, Integer, DateTime, Text, ForeignKey
 from sqlalchemy.dialects.postgresql import UUID
 from sqlalchemy.sql import func
 from sqlalchemy.orm import relationship
 from .session import Base
 import uuid
 class Researcher(Base):
    __tablename__ = "researchers"
@@ -11,18 +13,48 @@ class Researcher(Base):
    orcid_id = Column(String(19), unique=True, nullable=False)
    name = Column(Text)
    authenticated = Column(Boolean, default=False)
-    access_token = Column(Text)
+    access_token = Column(Text, nullable=True)
-    last_sync_at = Column(DateTime)
+    last_sync_at = Column(DateTime(timezone=True), server_default=func.now())
    publications = relationship(
        "Publication",
        back_populates="researcher",
        cascade="all, delete-orphan"
    )
    sync_jobs = relationship(
        "SyncJob",
        back_populates="researcher",
        cascade="all, delete-orphan"
    )
 class Publication(Base):
    __tablename__ = "publications"
    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
-    researcher_id = Column(UUID(as_uuid=True))
+    researcher_id = Column(UUID(as_uuid=True), ForeignKey("researchers.id"))
    put_code = Column(Integer)
    title = Column(Text)
    journal = Column(Text)
    doi = Column(Text)
    pub_year = Column(Integer)
    type = Column(Text)
    hash_fingerprint = Column(Text)
-    last_modified = Column(DateTime)
+    last_modified = Column(DateTime(timezone=True))
    researcher = relationship("Researcher", back_populates="publications")
 class SyncJob(Base):
    """ORM model for one synchronisation run of a researcher (table ``sync_jobs``)."""
    __tablename__ = "sync_jobs"
    # Primary key; a random UUID is generated on the Python side by default.
    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    # Researcher this run belongs to (foreign key to ``researchers.id``).
    researcher_id = Column(UUID(as_uuid=True), ForeignKey("researchers.id"))
    # Free-form state label, at most 20 characters.
    status = Column(String(20))
    # Counters for the run; both default to 0.
    new_records = Column(Integer, default=0)
    updated_records = Column(Integer, default=0)
    # Timezone-aware start time; the database fills it with now() by default.
    started_at = Column(DateTime(timezone=True), server_default=func.now())
    # Timezone-aware end time; no default, so it is NULL until assigned.
    finished_at = Column(DateTime(timezone=True))
    # Mirror of the ``sync_jobs`` relationship on the ``Researcher`` model.
    researcher = relationship("Researcher", back_populates="sync_jobs")
@@ -4,7 +4,23 @@ import os
 DATABASE_URL = os.getenv("DATABASE_URL")
-engine = create_engine(DATABASE_URL)
+engine = create_engine(
-SessionLocal = sessionmaker(bind=engine, autocommit=False, autoflush=False)
+    DATABASE_URL,
    future=True,
    echo=False
 )
 SessionLocal = sessionmaker(
    autocommit=False,
    autoflush=False,
    bind=engine
 )
 Base = declarative_base()
 def get_db():
    """Yield a new ``SessionLocal`` session and close it when the caller is done.

    Written as a generator so it can be used as a dependency
    (``Depends(get_db)``): the code after ``yield`` runs once the consumer
    has finished with the session.
    """
    db = SessionLocal()
    try:
        yield db
    finally:
        # Always close the session, even if the consumer raised.
        db.close()
@@ -1,9 +1,32 @@
 from fastapi import FastAPI
-from app.services.orcid_client import ORCIDClient
+from app.api.researchers import router as researchers_router
 from app.db.session import Base, engine
 app = FastAPI()
-@app.get("/orcid/{orcid_id}/works")
+app = FastAPI(
-def test_works(orcid_id: str):
+    title="ORCID SWORD Backend",
-    client = ORCIDClient()
+    description="Backend para sincronización ORCID y exportación SWORD",
-    return client.fetch_works(orcid_id)
+    version="1.0.0"
 )
 # ---------------------------------------------------------
 # Crear tablas al iniciar la aplicación
 # ---------------------------------------------------------
@app.on_event("startup")
 def startup_event():
    """Create the tables declared on ``Base.metadata`` when the application starts."""
    # NOTE(review): presumably leaves already-existing tables untouched
    # (SQLAlchemy's default ``checkfirst`` behaviour) — confirm.
    Base.metadata.create_all(bind=engine)
 # ---------------------------------------------------------
 # Healthcheck
 # ---------------------------------------------------------
@app.get("/health")
 def health():
    """Healthcheck endpoint: always answers with a constant ``{"status": "ok"}`` body."""
    return {"status": "ok"}
 # ---------------------------------------------------------
 # Registrar routers
 # ---------------------------------------------------------
 app.include_router(researchers_router)
@@ -0,0 +1,67 @@
 from sqlalchemy.orm import Session
 from app.db.models import Publication
 class PublicationRepository:
    """Static data-access helpers for ``Publication`` rows."""

    @staticmethod
    def get_by_put_code(db: Session, researcher_id: str, put_code: int):
        """Return the researcher's publication with this ORCID put-code, or ``None``."""
        lookup = db.query(Publication).filter(
            Publication.researcher_id == researcher_id,
            Publication.put_code == put_code
        )
        return lookup.first()

    @staticmethod
    def create(db: Session, researcher_id: str, data: dict):
        """Insert a publication built from a normalized work dict and return it.

        ``data`` must contain every key copied below; a missing key raises
        ``KeyError``. The row is committed and refreshed before returning.
        """
        copied_keys = (
            "put_code",
            "title",
            "journal",
            "doi",
            "pub_year",
            "type",
            "hash_fingerprint",
        )
        column_values = {key: data[key] for key in copied_keys}
        record = Publication(researcher_id=researcher_id, **column_values)
        db.add(record)
        db.commit()
        db.refresh(record)
        return record

    @staticmethod
    def update(db: Session, publication: Publication, data: dict):
        """Overwrite the mutable columns of ``publication`` from ``data`` and return it.

        The put-code and the owning researcher are left untouched. The change
        is committed and the instance refreshed before returning.
        """
        for key in ("title", "journal", "doi", "pub_year", "type", "hash_fingerprint"):
            setattr(publication, key, data[key])
        db.commit()
        db.refresh(publication)
        return publication

    @staticmethod
    def list_by_researcher(db: Session, researcher_id: str):
        """Return all publications of a researcher, newest year first, unknown years last."""
        owned = db.query(Publication).filter(Publication.researcher_id == researcher_id)
        newest_first = owned.order_by(Publication.pub_year.desc().nullslast())
        return newest_first.all()
@@ -0,0 +1,25 @@
 from sqlalchemy.orm import Session
 from app.db.models import Researcher
 from sqlalchemy.sql import func
 class ResearcherRepository:
    """Static data-access helpers for ``Researcher`` rows."""

    @staticmethod
    def get_by_orcid(db: Session, orcid_id: str):
        """Return the researcher with this ORCID iD, or ``None`` when absent."""
        matching = db.query(Researcher).filter(Researcher.orcid_id == orcid_id)
        return matching.first()

    @staticmethod
    def create(db: Session, orcid_id: str, name: str = None):
        """Insert a researcher, commit, and return the refreshed instance."""
        created = Researcher(orcid_id=orcid_id, name=name)
        db.add(created)
        db.commit()
        db.refresh(created)
        return created

    @staticmethod
    def update_last_sync(db: Session, researcher: Researcher):
        """Stamp ``last_sync_at`` with the database's ``now()`` and return the row."""
        # ``func.now()`` is evaluated by the database at commit time; the
        # refresh then loads the concrete timestamp back onto the instance.
        researcher.last_sync_at = func.now()
        db.commit()
        db.refresh(researcher)
        return researcher
@@ -0,0 +1,28 @@
 from sqlalchemy.orm import Session
 from app.db.models import SyncJob
 from sqlalchemy.sql import func
 class SyncJobRepository:
    """Static data-access helpers for ``SyncJob`` rows."""

    @staticmethod
    def start_job(db: Session, researcher_id: str):
        """Insert a job in the "running" state for the researcher and return it."""
        initial_state = {
            "researcher_id": researcher_id,
            "status": "running",
            "started_at": func.now(),
        }
        running = SyncJob(**initial_state)
        db.add(running)
        db.commit()
        db.refresh(running)
        return running

    @staticmethod
    def finish_job(db: Session, job: SyncJob, new_records: int, updated_records: int):
        """Mark ``job`` as "finished", store its counters and end time, and return it."""
        final_state = {
            "status": "finished",
            "new_records": new_records,
            "updated_records": updated_records,
            "finished_at": func.now(),
        }
        for column, value in final_state.items():
            setattr(job, column, value)
        db.commit()
        db.refresh(job)
        return job
@@ -0,0 +1,16 @@
 from datetime import datetime
 from uuid import UUID

 from pydantic import BaseModel
 class PublicationSchema(BaseModel):
    """API representation of a stored publication.

    Instances are built from ORM objects (``from_attributes``), so every
    field type has to accept what the corresponding model column yields.
    """
    id: UUID
    put_code: int | None = None
    title: str
    journal: str | None = None
    doi: str | None = None
    pub_year: int | None = None
    type: str | None = None
    hash_fingerprint: str | None = None
    # The model column is a timezone-aware DateTime, so the ORM hands over a
    # ``datetime``. Declaring this as ``str`` made validation fail as soon as
    # the column was populated; in JSON it is still emitted as an ISO string.
    last_modified: datetime | None = None

    class Config:
        # Allow validation from attribute-bearing objects (ORM rows).
        from_attributes = True
@@ -0,0 +1,74 @@
 class PublicationNormalizer:
    """Flatten an ORCID work summary into the dict consumed by the repositories.

    Fields in the incoming payload may be missing *or* present with an
    explicit ``None`` value, so every nested lookup tolerates both.
    """

    @staticmethod
    def safe_get_title(summary):
        """Extract the title string from a work summary, or ``None``.

        Supported shapes of ``summary["title"]``:
          1. ``{"title": {"value": "..."}}``
          2. ``{"title": "..."}``
          3. ``"..."`` (plain string)
          4. ``{"value": "..."}``
        Anything else yields ``None``.
        """
        t = summary.get("title")
        if isinstance(t, str):
            return t  # shape 3
        if not isinstance(t, dict):
            return None
        inner = t.get("title")
        if isinstance(inner, dict):
            return inner.get("value")  # shape 1
        if isinstance(inner, str):
            return inner  # shape 2
        return t.get("value")  # shape 4, or None when absent

    @staticmethod
    def normalize_work(summary: dict) -> dict:
        """Return the normalized publication dict for one work summary.

        Args:
            summary: a single work-summary mapping.

        Returns:
            Dict with keys ``put_code``, ``title`` (``"Untitled"`` when the
            summary has none), ``journal``, ``doi``, ``pub_year``, ``type``
            and ``hash_fingerprint``.
        """
        title = PublicationNormalizer.safe_get_title(summary)

        # Journal title: either a {"value"/"title": ...} mapping or a bare value.
        journal_raw = summary.get("journal-title")
        if isinstance(journal_raw, dict):
            journal = journal_raw.get("value") or journal_raw.get("title")
        else:
            journal = journal_raw

        # DOI: first external id of type "doi". ``or`` fallbacks cover keys
        # that exist but hold None, which ``.get(key, default)`` does not.
        ext_ids = (summary.get("external-ids") or {}).get("external-id") or []
        doi = next(
            (
                ext.get("external-id-value")
                for ext in ext_ids
                if isinstance(ext, dict) and ext.get("external-id-type") == "doi"
            ),
            None,
        )

        # Publication year, with the same None-tolerant traversal.
        year_node = (summary.get("publication-date") or {}).get("year") or {}
        pub_year = year_node.get("value")

        work_type = summary.get("type")
        put_code = summary.get("put-code")

        # Fingerprint is built from the raw title (not the "Untitled"
        # fallback), lower-cased and with spaces removed.
        fingerprint = f"{title}-{doi}-{pub_year}-{work_type}".lower().replace(" ", "")

        return {
            "put_code": put_code,
            "title": title or "Untitled",
            "journal": journal,
            "doi": doi,
            "pub_year": pub_year,
            "type": work_type,
            "hash_fingerprint": fingerprint,
        }
@@ -2,18 +2,13 @@ import httpx
 import os
 from typing import Optional
 class ORCIDClient:
    """
    Cliente para interactuar con la Public API de ORCID.
    Permite:
    - Obtener token público
    - Consultar /record
    - Consultar /works
    """
-    TOKEN_URL = "https://orcid.org/oauth/token"
+    TOKEN_URL = "https://sandbox.orcid.org/oauth/token"
-    BASE_URL = "https://pub.orcid.org/v3.0"
+    BASE_URL = "https://pub.sandbox.orcid.org/v3.0"
    # TOKEN_URL = "https://orcid.org/oauth/token"
    # BASE_URL = "https://pub.orcid.org/v3.0"
    def __init__(self):
        self.client_id = os.getenv("ORCID_CLIENT_ID")
@@ -0,0 +1,155 @@
 from datetime import datetime
 from xml.etree.ElementTree import Element, SubElement, tostring
 from io import BytesIO
 import zipfile
 import json
 class SWORDExporter:
    """Build the export artefacts (feed XML, manifest, JSON, METS, ZIP) for a researcher.

    All methods are static. ``researcher`` needs ``orcid_id``, ``name`` and
    ``id`` attributes; each publication needs ``id``, ``title``, ``doi``,
    ``pub_year``, ``type`` and ``journal``.
    """

    ATOM_NS = "http://www.w3.org/2005/Atom"
    DC_NS = "http://purl.org/dc/elements/1.1/"

    @staticmethod
    def _utc_timestamp() -> str:
        """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SS[.ffffff]Z``."""
        # Local import: the module only imports ``datetime`` from ``datetime``.
        from datetime import timezone

        # ``datetime.utcnow()`` is deprecated; take an aware "now" and drop the
        # tzinfo so the rendered text keeps the established "...Z" format.
        return datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"

    @staticmethod
    def _add_dc(parent, tag, text):
        """Append a Dublin Core child ``tag`` with ``text`` to ``parent`` and return it."""
        node = SubElement(parent, f"{{{SWORDExporter.DC_NS}}}{tag}")
        node.text = text
        return node

    # ---------------------------------------------------------
    # 1) MAIN XML (sword.xml)
    # ---------------------------------------------------------
    @staticmethod
    def export_feed_xml(researcher, publications) -> bytes:
        """Serialise the publications as a UTF-8 XML feed with one entry each.

        The feed and all of its entries share a single ``updated`` timestamp
        taken once per call. Optional Dublin Core elements (identifier, date,
        type, source) are emitted only when the publication has a value.
        """
        stamp = SWORDExporter._utc_timestamp()

        feed = Element("feed", xmlns=SWORDExporter.ATOM_NS)
        SubElement(feed, "title").text = f"Publications for {researcher.orcid_id}"
        author = SubElement(feed, "author")
        SubElement(author, "name").text = researcher.name or "Unknown"
        SubElement(feed, "updated").text = stamp
        SubElement(feed, "id").text = f"urn:uuid:{researcher.id}"

        for pub in publications:
            entry = SubElement(feed, "entry")
            SubElement(entry, "id").text = f"urn:uuid:{pub.id}"
            SubElement(entry, "updated").text = stamp
            SWORDExporter._add_dc(entry, "title", pub.title)
            if pub.doi:
                SWORDExporter._add_dc(entry, "identifier", f"doi:{pub.doi}")
            if pub.pub_year:
                SWORDExporter._add_dc(entry, "date", str(pub.pub_year))
            if pub.type:
                SWORDExporter._add_dc(entry, "type", pub.type)
            if pub.journal:
                SWORDExporter._add_dc(entry, "source", pub.journal)

        return tostring(feed, encoding="utf-8", xml_declaration=True)

    # ---------------------------------------------------------
    # 2) manifest.txt
    # ---------------------------------------------------------
    @staticmethod
    def generate_manifest(researcher, publications) -> str:
        """Return a plain-text summary of the package, one line per publication."""
        lines = [
            "SWORD Deposit Package",
            "----------------------",
            f"Researcher ORCID: {researcher.orcid_id}",
            f"Researcher Name: {researcher.name or 'Unknown'}",
            f"Total Publications: {len(publications)}",
            f"Generated At: {SWORDExporter._utc_timestamp()}",
            "",
            "Publications:",
        ]
        lines.extend(
            f"- {pub.title} ({pub.pub_year}) DOI={pub.doi}" for pub in publications
        )
        return "\n".join(lines)

    # ---------------------------------------------------------
    # 3) metadata.json
    # ---------------------------------------------------------
    @staticmethod
    def generate_metadata_json(researcher, publications) -> str:
        """Return the researcher and publication metadata as an indented JSON string."""
        data = {
            "researcher": {
                "orcid_id": researcher.orcid_id,
                "name": researcher.name,
                "id": str(researcher.id),
            },
            "generated_at": SWORDExporter._utc_timestamp(),
            "publications": [
                {
                    "id": str(pub.id),
                    "title": pub.title,
                    "doi": pub.doi,
                    "year": pub.pub_year,
                    "type": pub.type,
                    "journal": pub.journal,
                }
                for pub in publications
            ],
        }
        return json.dumps(data, indent=4)

    # ---------------------------------------------------------
    # 4) mets.xml (simple version)
    # ---------------------------------------------------------
    @staticmethod
    def generate_mets_xml(researcher, publications) -> bytes:
        """Return a minimal METS document wrapping DC title/identifier pairs.

        ``researcher`` is accepted for signature symmetry but not used.
        """
        mets = Element("mets", xmlns="http://www.loc.gov/METS/")
        header = SubElement(mets, "metsHdr")
        agent = SubElement(header, "agent", ROLE="CREATOR", TYPE="OTHER")
        SubElement(agent, "name").text = "ORCID Exporter System"

        dmd_sec = SubElement(mets, "dmdSec", ID="dmd1")
        md_wrap = SubElement(dmd_sec, "mdWrap", MDTYPE="DC")
        xml_data = SubElement(md_wrap, "xmlData")
        for pub in publications:
            SWORDExporter._add_dc(xml_data, "title", pub.title)
            if pub.doi:
                SWORDExporter._add_dc(xml_data, "identifier", f"doi:{pub.doi}")
        return tostring(mets, encoding="utf-8", xml_declaration=True)

    # ---------------------------------------------------------
    # 5) FINAL ZIP
    # ---------------------------------------------------------
    @staticmethod
    def export_zip(researcher, publications) -> bytes:
        """Return a deflated ZIP holding sword.xml, manifest.txt, metadata.json and mets.xml."""
        members = (
            ("sword.xml", SWORDExporter.export_feed_xml(researcher, publications)),
            ("manifest.txt", SWORDExporter.generate_manifest(researcher, publications)),
            ("metadata.json", SWORDExporter.generate_metadata_json(researcher, publications)),
            ("mets.xml", SWORDExporter.generate_mets_xml(researcher, publications)),
        )
        mem_file = BytesIO()
        with zipfile.ZipFile(mem_file, mode="w", compression=zipfile.ZIP_DEFLATED) as zf:
            for arcname, content in members:
                zf.writestr(arcname, content)
        return mem_file.getvalue()
@@ -0,0 +1,96 @@
 from sqlalchemy.orm import Session
 from app.services.orcid_client import ORCIDClient
 from app.services.normalizer import PublicationNormalizer
 from app.repositories.researcher_repository import ResearcherRepository
 from app.repositories.publication_repository import PublicationRepository
 from app.repositories.syncjob_repository import SyncJobRepository
 import httpx
 class SyncService:
    """Synchronise a researcher's ORCID works into the local database."""

    def __init__(self):
        self.orcid_client = ORCIDClient()

    @staticmethod
    def _extract_given_names(record):
        """Return ``person.name.given-names.value`` from a record, or ``None``.

        Each level may be missing or explicitly ``None``; both are tolerated.
        """
        person = (record or {}).get("person") or {}
        name = person.get("name") or {}
        given = name.get("given-names") or {}
        return given.get("value")

    @staticmethod
    def _fail_job(db, job, new_records, updated_records):
        """Close ``job`` with status "failed" so it does not stay "running" forever."""
        # Local import: this module does not import datetime at file level.
        from datetime import datetime, timezone

        job.status = "failed"
        job.new_records = new_records
        job.updated_records = updated_records
        job.finished_at = datetime.now(timezone.utc)
        db.commit()

    def sync_researcher(self, db: Session, orcid_id: str):
        """Fetch the researcher's works and upsert them by put-code.

        Args:
            db: SQLAlchemy session used for every read and write.
            orcid_id: ORCID iD of the researcher to synchronise.

        Returns:
            A dict whose ``status`` is ``"ok"`` or ``"error"``. On success it
            carries ``new_records``, ``updated_records`` and ``total``; on
            error it carries ``message``.

        Raises:
            Exception: anything raised while storing works is re-raised
                after the sync job has been marked "failed".
        """
        # 1. Get or create the researcher.
        try:
            researcher = ResearcherRepository.get_by_orcid(db, orcid_id)
            if not researcher:
                record = self.orcid_client.fetch_record(orcid_id)
                researcher = ResearcherRepository.create(
                    db, orcid_id, self._extract_given_names(record)
                )
        except httpx.HTTPError as e:
            # HTTPError also covers transport failures (timeouts, DNS, ...),
            # which previously escaped as an unhandled exception.
            if isinstance(e, httpx.HTTPStatusError) and e.response.status_code == 404:
                return {
                    "status": "error",
                    "message": f"El ORCID {orcid_id} no existe en Sandbox."
                }
            return {"status": "error", "message": str(e)}

        # 2. Open the sync job.
        job = SyncJobRepository.start_job(db, researcher.id)

        # 3. Fetch the works.
        try:
            works_raw = self.orcid_client.fetch_works(orcid_id)
        except httpx.HTTPError as e:
            if isinstance(e, httpx.HTTPStatusError) and e.response.status_code == 404:
                SyncJobRepository.finish_job(db, job, 0, 0)
                ResearcherRepository.update_last_sync(db, researcher)
                return {
                    "status": "ok",
                    "message": "El ORCID existe pero no tiene publicaciones públicas.",
                    "new_records": 0,
                    "updated_records": 0,
                    "total": 0
                }
            # Any other failure must close the job instead of leaving it "running".
            self._fail_job(db, job, 0, 0)
            return {"status": "error", "message": str(e)}

        groups = (works_raw or {}).get("group") or []
        new_records = 0
        updated_records = 0

        # 4. Process the works; duplicates are detected by put-code.
        try:
            for group in groups:
                summaries = group.get("work-summary") or []
                if not summaries:
                    continue  # a group without summaries has nothing to store
                normalized = PublicationNormalizer.normalize_work(summaries[0])
                existing = PublicationRepository.get_by_put_code(
                    db, researcher.id, normalized["put_code"]
                )
                if existing:
                    PublicationRepository.update(db, existing, normalized)
                    updated_records += 1
                else:
                    PublicationRepository.create(db, researcher.id, normalized)
                    new_records += 1
        except Exception:
            # Reset the session before writing the failure, then let the
            # original error propagate to the caller unchanged.
            db.rollback()
            self._fail_job(db, job, new_records, updated_records)
            raise

        # 5. Close the sync job.
        SyncJobRepository.finish_job(db, job, new_records, updated_records)

        # 6. Record the time of this sync on the researcher.
        ResearcherRepository.update_last_sync(db, researcher)

        return {
            "status": "ok",
            "message": "Sincronización completada correctamente.",
            "researcher": researcher.orcid_id,
            "new_records": new_records,
            "updated_records": updated_records,
            "total": new_records + updated_records
        }
@@ -0,0 +1,28 @@
 import re
 # ASCII-only character classes: ``\d`` would also accept non-ASCII Unicode
 # digits (e.g. Arabic-Indic), letting non-ASCII strings pass as ORCID iDs.
 ORCID_REGEX = re.compile(r"^[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{3}[0-9X]$")

 def is_valid_orcid(orcid_id: str) -> bool:
    """Check whether ``orcid_id`` is a well-formed ORCID iD.

    A valid iD has the form ``0000-0000-0000-000X`` (ASCII digits, final
    character a digit or uppercase ``X``) and its last character matches the
    ISO 7064 Mod 11-2 check digit of the first 15 digits.

    Args:
        orcid_id: candidate identifier; non-string values are rejected.

    Returns:
        ``True`` when both the format and the check digit are correct.
    """
    if not isinstance(orcid_id, str):
        return False
    # ``fullmatch`` (unlike ``match`` with ``$``) also rejects a trailing newline.
    if not ORCID_REGEX.fullmatch(orcid_id):
        return False

    digits = orcid_id.replace("-", "")

    # ISO 7064 Mod 11-2 over the first 15 digits.
    total = 0
    for char in digits[:-1]:
        total = (total + int(char)) * 2

    result = (12 - total % 11) % 11
    check_digit = "X" if result == 10 else str(result)
    return digits[-1] == check_digit