router = APIRouter(prefix="/researchers", tags=["researchers"])


def validate_orcid_or_400(orcid_id: str) -> None:
    """Reject a malformed ORCID iD.

    Raises:
        HTTPException: status 400 when ``is_valid_orcid(orcid_id)`` is false.
    """
    if not is_valid_orcid(orcid_id):
        raise HTTPException(
            status_code=400,
            detail=f"ORCID ID '{orcid_id}' no es válido según el formato y dígito de control."
        )


def _get_researcher_or_404(db: Session, orcid_id: str):
    """Validate ``orcid_id`` and return the stored researcher.

    Raises:
        HTTPException: status 400 for an invalid ORCID iD, status 404 when no
            researcher with that ORCID iD is stored.
    """
    validate_orcid_or_400(orcid_id)
    researcher = ResearcherRepository.get_by_orcid(db, orcid_id)
    if not researcher:
        raise HTTPException(status_code=404, detail="Researcher not found")
    return researcher


@router.post("/", response_model=dict)
def create_researcher(orcid_id: str, db: Session = Depends(get_db)):
    """Create a researcher for ``orcid_id``, or report the existing one.

    Returns a status payload carrying the researcher's ORCID iD and row id.
    """
    validate_orcid_or_400(orcid_id)

    existing = ResearcherRepository.get_by_orcid(db, orcid_id)
    if existing:
        return {
            "status": "ok",
            "message": "Researcher ya existe.",
            "orcid_id": existing.orcid_id,
            "id": existing.id
        }

    # The ORCID iD could optionally be checked against the ORCID API here.
    researcher = ResearcherRepository.create(db, orcid_id, name=None)

    return {
        "status": "ok",
        "message": "Researcher creado correctamente.",
        "orcid_id": researcher.orcid_id,
        "id": researcher.id
    }


@router.get("/{orcid_id}", response_model=dict)
def get_researcher(orcid_id: str, db: Session = Depends(get_db)):
    """Return the public profile fields of a stored researcher."""
    researcher = _get_researcher_or_404(db, orcid_id)

    # The stored OAuth access token is a credential and is deliberately not
    # part of the response: this route has no authentication of its own.
    return {
        "orcid_id": researcher.orcid_id,
        "name": researcher.name,
        "authenticated": researcher.authenticated,
        "id": researcher.id,
        "last_sync_at": researcher.last_sync_at,
    }


@router.post("/{orcid_id}/sync", response_model=dict)
def sync_researcher(orcid_id: str, db: Session = Depends(get_db)):
    """Run a synchronisation for ``orcid_id`` and return the service result."""
    validate_orcid_or_400(orcid_id)

    service = SyncService()
    return service.sync_researcher(db, orcid_id)


# The router already tags every route with "researchers"; repeating the tag on
# the route would list it twice on the operation.
@router.get("/{orcid_id}/publications", response_model=list[PublicationSchema])
def get_publications(orcid_id: str, db: Session = Depends(get_db)):
    """Return the publications related to a stored researcher."""
    # Validates the ORCID iD first, like every other route in this module.
    researcher = _get_researcher_or_404(db, orcid_id)
    return researcher.publications


@router.get("/{orcid_id}/export/sword.xml")
def export_sword_xml(orcid_id: str, db: Session = Depends(get_db)):
    """Download the researcher's publications as a single XML feed."""
    researcher = _get_researcher_or_404(db, orcid_id)

    pubs = PublicationRepository.list_by_researcher(db, researcher.id)
    xml_bytes = SWORDExporter.export_feed_xml(researcher, pubs)

    return Response(
        content=xml_bytes,
        media_type="application/xml",
        headers={
            "Content-Disposition": f'attachment; filename="sword_{orcid_id}.xml"'
        }
    )


@router.get("/{orcid_id}/export/sword.zip")
def export_sword_zip(orcid_id: str, db: Session = Depends(get_db)):
    """Download the researcher's publications as a ZIP package."""
    researcher = _get_researcher_or_404(db, orcid_id)

    pubs = PublicationRepository.list_by_researcher(db, researcher.id)
    zip_bytes = SWORDExporter.export_zip(researcher, pubs)

    return Response(
        content=zip_bytes,
        media_type="application/zip",
        headers={
            "Content-Disposition": f'attachment; filename="sword_{orcid_id}.zip"'
        }
    )
class Publication(Base):
    """ORM model for one row of the ``publications`` table."""

    __tablename__ = "publications"

    # Primary key, generated client-side with ``uuid.uuid4``.
    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    # Foreign key to ``researchers.id`` (the owning researcher).
    researcher_id = Column(UUID(as_uuid=True), ForeignKey("researchers.id"))
    put_code = Column(Integer)
    title = Column(Text)
    journal = Column(Text)
    doi = Column(Text)
    pub_year = Column(Integer)
    type = Column(Text)
    hash_fingerprint = Column(Text)
    # Timezone-aware timestamp; no default is declared on the column.
    last_modified = Column(DateTime(timezone=True))

    # Many-to-one side of ``Researcher.publications``.
    researcher = relationship("Researcher", back_populates="publications")


class SyncJob(Base):
    """ORM model for one row of the ``sync_jobs`` table."""

    __tablename__ = "sync_jobs"

    # Primary key, generated client-side with ``uuid.uuid4``.
    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    # Foreign key to ``researchers.id`` (the researcher being synchronised).
    researcher_id = Column(UUID(as_uuid=True), ForeignKey("researchers.id"))
    # Free-form status string of at most 20 characters.
    status = Column(String(20))
    # Counters default to 0 when not supplied.
    new_records = Column(Integer, default=0)
    updated_records = Column(Integer, default=0)
    # Filled by the database with ``now()`` when not supplied.
    started_at = Column(DateTime(timezone=True), server_default=func.now())
    finished_at = Column(DateTime(timezone=True))

    # Many-to-one side of ``Researcher.sync_jobs``.
    researcher = relationship("Researcher", back_populates="sync_jobs")
def get_db():
    """Yield a new ``SessionLocal`` session and close it afterwards.

    Written as a generator so it can be used as a dependency: the session is
    closed in the ``finally`` clause whether or not the consumer raised.
    """
    session = SessionLocal()
    try:
        yield session
    finally:
        session.close()
class PublicationRepository:
    """Data-access helpers for ``Publication`` rows."""

    @staticmethod
    def get_by_put_code(db: Session, researcher_id: str, put_code: int):
        """Return the researcher's publication with this put-code, or ``None``."""
        query = db.query(Publication).filter(
            Publication.researcher_id == researcher_id,
            Publication.put_code == put_code
        )
        return query.first()

    @staticmethod
    def create(db: Session, researcher_id: str, data: dict):
        """Insert a publication built from normalized ``data`` and return it.

        ``data`` must contain every key listed below; a missing key raises
        ``KeyError``. The row is committed and refreshed before returning.
        """
        columns = (
            "put_code",
            "title",
            "journal",
            "doi",
            "pub_year",
            "type",
            "hash_fingerprint",
        )
        values = {column: data[column] for column in columns}
        record = Publication(researcher_id=researcher_id, **values)
        db.add(record)
        db.commit()
        db.refresh(record)
        return record

    @staticmethod
    def update(db: Session, publication: Publication, data: dict):
        """Overwrite the mutable fields of ``publication`` from ``data``.

        The put-code and owner are left untouched. The row is committed and
        refreshed before returning.
        """
        for column in ("title", "journal", "doi", "pub_year", "type", "hash_fingerprint"):
            setattr(publication, column, data[column])

        db.commit()
        db.refresh(publication)
        return publication

    @staticmethod
    def list_by_researcher(db: Session, researcher_id: str):
        """Return the researcher's publications, newest year first, nulls last."""
        newest_first = Publication.pub_year.desc().nullslast()
        owned = db.query(Publication).filter(Publication.researcher_id == researcher_id)
        return owned.order_by(newest_first).all()
class ResearcherRepository:
    """Data-access helpers for ``Researcher`` rows."""

    @staticmethod
    def get_by_orcid(db: Session, orcid_id: str):
        """Return the researcher with this ORCID iD, or ``None``."""
        matching = db.query(Researcher).filter(Researcher.orcid_id == orcid_id)
        return matching.first()

    @staticmethod
    def create(db: Session, orcid_id: str, name: str = None):
        """Insert a researcher, commit, and return the refreshed row."""
        row = Researcher(orcid_id=orcid_id, name=name)
        db.add(row)
        db.commit()
        db.refresh(row)
        return row

    @staticmethod
    def update_last_sync(db: Session, researcher: Researcher):
        """Set ``last_sync_at`` to the database's ``now()`` and commit."""
        researcher.last_sync_at = func.now()
        db.commit()
        db.refresh(researcher)
        return researcher


class SyncJobRepository:
    """Data-access helpers for ``SyncJob`` rows."""

    @staticmethod
    def start_job(db: Session, researcher_id: str):
        """Insert a job in status ``running`` and return the refreshed row."""
        running = SyncJob(
            researcher_id=researcher_id,
            status="running",
            started_at=func.now()
        )
        db.add(running)
        db.commit()
        db.refresh(running)
        return running

    @staticmethod
    def finish_job(db: Session, job: SyncJob, new_records: int, updated_records: int):
        """Mark ``job`` as ``finished`` with its counters and end time."""
        job.status = "finished"
        job.new_records = new_records
        job.updated_records = updated_records
        job.finished_at = func.now()
        db.commit()
        db.refresh(job)
        return job
from datetime import datetime
from uuid import UUID

from pydantic import BaseModel


class PublicationSchema(BaseModel):
    """Response schema for a publication, populated from object attributes."""

    id: UUID
    put_code: int | None = None
    title: str
    journal: str | None = None
    doi: str | None = None
    pub_year: int | None = None
    type: str | None = None
    hash_fingerprint: str | None = None
    # The ORM column is ``DateTime(timezone=True)``. Annotating this field as
    # ``str`` would make validation fail as soon as the column holds a value.
    last_modified: datetime | None = None

    class Config:
        # Allow building the schema from ORM objects instead of dicts.
        from_attributes = True
class PublicationNormalizer:
    """Flatten an ORCID work-summary dict into the fields stored per publication."""

    @staticmethod
    def safe_get_title(summary):
        """Return the title found under ``summary["title"]``, or ``None``.

        Accepted shapes of the ``title`` value:
          * ``{"title": {"value": "..."}}``
          * ``{"title": "..."}``
          * ``"..."`` (plain string)
          * ``{"value": "..."}``
        """
        raw = summary.get("title")

        if isinstance(raw, str):
            return raw
        if not isinstance(raw, dict):
            return None

        nested = raw.get("title")
        if isinstance(nested, dict):
            return nested.get("value")
        if isinstance(nested, str):
            return nested

        return raw.get("value")

    @staticmethod
    def normalize_work(summary: dict) -> dict:
        """Build the normalized publication dict for one work summary.

        Keys that are absent *or explicitly null* in ``summary`` are treated
        the same way, so a sparse summary never raises.

        Returns:
            A dict with the keys ``put_code``, ``title``, ``journal``, ``doi``,
            ``pub_year`` (``int`` or ``None``), ``type`` and
            ``hash_fingerprint``. ``title`` falls back to ``"Untitled"``.
        """
        title = PublicationNormalizer.safe_get_title(summary)

        # Journal title: either a plain value or a dict holding it.
        journal_raw = summary.get("journal-title")
        if isinstance(journal_raw, dict):
            journal = journal_raw.get("value") or journal_raw.get("title")
        else:
            journal = journal_raw

        # DOI: value of the first external id whose type is "doi".
        # ``or`` guards cover keys that are present with a null value, which
        # ``dict.get(key, default)`` alone does not.
        ext_ids = (summary.get("external-ids") or {}).get("external-id") or []
        doi = next(
            (
                ext.get("external-id-value")
                for ext in ext_ids
                if isinstance(ext, dict) and ext.get("external-id-type") == "doi"
            ),
            None,
        )

        # Publication year, converted to int to match the integer column.
        year_raw = (
            ((summary.get("publication-date") or {}).get("year") or {}).get("value")
        )
        try:
            pub_year = int(year_raw) if year_raw is not None else None
        except (TypeError, ValueError):
            pub_year = None

        work_type = summary.get("type")
        put_code = summary.get("put-code")

        # Fingerprint: lower-cased, space-free join of the identifying fields.
        # It is built from the raw title, so a missing title contributes "none".
        fingerprint = (
            f"{title}-{doi}-{pub_year}-{work_type}".lower().replace(" ", "")
        )

        return {
            "put_code": put_code,
            "title": title or "Untitled",
            "journal": journal,
            "doi": doi,
            "pub_year": pub_year,
            "type": work_type,
            "hash_fingerprint": fingerprint
        }
class SWORDExporter:
    """Build the export artefacts for a researcher's publications.

    Produces an Atom-style feed (``sword.xml``), a text manifest, a JSON
    metadata file and a simple METS document, and can bundle all four in a ZIP.
    ``researcher`` objects must expose ``id``, ``orcid_id`` and ``name``;
    publication objects must expose ``id``, ``title``, ``doi``, ``pub_year``,
    ``type`` and ``journal``.
    """

    ATOM_NS = "http://www.w3.org/2005/Atom"
    DC_NS = "http://purl.org/dc/elements/1.1/"

    @staticmethod
    def _utc_timestamp() -> str:
        """Return the current UTC time as ISO-8601 text with a ``Z`` suffix."""
        # Local import: the file-level import only brings in ``datetime``.
        from datetime import timezone

        # ``datetime.utcnow()`` is deprecated; take an aware "now" and drop the
        # tzinfo so the text keeps the ``...Z`` form instead of ``+00:00``.
        now = datetime.now(timezone.utc).replace(tzinfo=None)
        return now.isoformat() + "Z"

    @staticmethod
    def _add_dc(parent, tag, text):
        """Append a Dublin Core child element ``tag`` with ``text`` to ``parent``."""
        node = SubElement(parent, f"{{{SWORDExporter.DC_NS}}}{tag}")
        node.text = text
        return node

    # ---------------------------------------------------------
    # 1) MAIN XML (sword.xml)
    # ---------------------------------------------------------
    @staticmethod
    def export_feed_xml(researcher, publications) -> bytes:
        """Return the feed XML (UTF-8, with XML declaration) as bytes.

        The feed and every entry carry the same ``updated`` timestamp, taken
        once when the export starts.
        """
        generated_at = SWORDExporter._utc_timestamp()

        feed = Element("feed", xmlns=SWORDExporter.ATOM_NS)
        SubElement(feed, "title").text = f"Publications for {researcher.orcid_id}"

        author = SubElement(feed, "author")
        SubElement(author, "name").text = researcher.name or "Unknown"

        SubElement(feed, "updated").text = generated_at
        SubElement(feed, "id").text = f"urn:uuid:{researcher.id}"

        for pub in publications:
            entry = SubElement(feed, "entry")
            SubElement(entry, "id").text = f"urn:uuid:{pub.id}"
            SubElement(entry, "updated").text = generated_at

            SWORDExporter._add_dc(entry, "title", pub.title)

            # Optional fields are emitted only when they hold a truthy value.
            if pub.doi:
                SWORDExporter._add_dc(entry, "identifier", f"doi:{pub.doi}")
            if pub.pub_year:
                SWORDExporter._add_dc(entry, "date", str(pub.pub_year))
            if pub.type:
                SWORDExporter._add_dc(entry, "type", pub.type)
            if pub.journal:
                SWORDExporter._add_dc(entry, "source", pub.journal)

        return tostring(feed, encoding="utf-8", xml_declaration=True)

    # ---------------------------------------------------------
    # 2) manifest.txt
    # ---------------------------------------------------------
    @staticmethod
    def generate_manifest(researcher, publications) -> str:
        """Return the plain-text manifest, one line per publication."""
        lines = [
            "SWORD Deposit Package",
            "----------------------",
            f"Researcher ORCID: {researcher.orcid_id}",
            f"Researcher Name: {researcher.name or 'Unknown'}",
            f"Total Publications: {len(publications)}",
            f"Generated At: {SWORDExporter._utc_timestamp()}",
            "",
            "Publications:",
        ]
        lines.extend(
            f"- {pub.title} ({pub.pub_year}) DOI={pub.doi}" for pub in publications
        )
        return "\n".join(lines)

    # ---------------------------------------------------------
    # 3) metadata.json
    # ---------------------------------------------------------
    @staticmethod
    def generate_metadata_json(researcher, publications) -> str:
        """Return the researcher and publication metadata as indented JSON text."""
        data = {
            "researcher": {
                "orcid_id": researcher.orcid_id,
                "name": researcher.name,
                "id": str(researcher.id),
            },
            "generated_at": SWORDExporter._utc_timestamp(),
            "publications": [
                {
                    "id": str(pub.id),
                    "title": pub.title,
                    "doi": pub.doi,
                    "year": pub.pub_year,
                    "type": pub.type,
                    "journal": pub.journal,
                }
                for pub in publications
            ],
        }
        return json.dumps(data, indent=4)

    # ---------------------------------------------------------
    # 4) mets.xml (simple version)
    # ---------------------------------------------------------
    @staticmethod
    def generate_mets_xml(researcher, publications) -> bytes:
        """Return a minimal METS document listing titles and DOIs as bytes."""
        mets = Element("mets", xmlns="http://www.loc.gov/METS/")

        header = SubElement(mets, "metsHdr")
        agent = SubElement(header, "agent", ROLE="CREATOR", TYPE="OTHER")
        SubElement(agent, "name").text = "ORCID Exporter System"

        dmd_sec = SubElement(mets, "dmdSec", ID="dmd1")
        md_wrap = SubElement(dmd_sec, "mdWrap", MDTYPE="DC")
        xml_data = SubElement(md_wrap, "xmlData")

        for pub in publications:
            SWORDExporter._add_dc(xml_data, "title", pub.title)
            if pub.doi:
                SWORDExporter._add_dc(xml_data, "identifier", f"doi:{pub.doi}")

        return tostring(mets, encoding="utf-8", xml_declaration=True)

    # ---------------------------------------------------------
    # 5) FINAL ZIP
    # ---------------------------------------------------------
    @staticmethod
    def export_zip(researcher, publications) -> bytes:
        """Return a deflated ZIP holding the four artefacts, built in memory.

        Members, in order: ``sword.xml``, ``manifest.txt``, ``metadata.json``
        and ``mets.xml``.
        """
        members = (
            ("sword.xml", SWORDExporter.export_feed_xml(researcher, publications)),
            ("manifest.txt", SWORDExporter.generate_manifest(researcher, publications)),
            ("metadata.json", SWORDExporter.generate_metadata_json(researcher, publications)),
            ("mets.xml", SWORDExporter.generate_mets_xml(researcher, publications)),
        )

        buffer = BytesIO()
        with zipfile.ZipFile(buffer, mode="w", compression=zipfile.ZIP_DEFLATED) as zf:
            for member_name, payload in members:
                zf.writestr(member_name, payload)

        return buffer.getvalue()
class SyncService:
    """Synchronise a researcher's ORCID works into the local database."""

    def __init__(self):
        self.orcid_client = ORCIDClient()

    @staticmethod
    def _extract_given_name(record):
        """Return ``person.name.given-names.value`` from ``record``, or ``None``.

        Every level may be missing or explicitly null; both cases yield ``None``.
        """
        person = (record or {}).get("person") or {}
        name = person.get("name") or {}
        given_names = name.get("given-names") or {}
        return given_names.get("value")

    @staticmethod
    def _fail_job(db: Session, job, new_records: int, updated_records: int):
        """Close ``job`` with its counters and mark it as ``failed``.

        ``SyncJobRepository.finish_job`` stamps the end time but always writes
        the status ``finished``, so the status is overwritten afterwards.
        """
        SyncJobRepository.finish_job(db, job, new_records, updated_records)
        job.status = "failed"
        db.commit()

    def sync_researcher(self, db: Session, orcid_id: str):
        """Fetch the researcher's works from ORCID and upsert them.

        Works are matched to stored publications by put-code: matches are
        updated, the rest are created. A ``SyncJob`` row tracks the run and is
        always closed, as ``finished`` on success and ``failed`` otherwise.

        Returns:
            A dict with ``status`` (``"ok"`` or ``"error"``) and ``message``;
            successful runs also carry ``new_records``, ``updated_records``
            and ``total``.

        Raises:
            Exception: any non-HTTP-status error raised while processing the
                works is re-raised after the job has been marked as failed.
        """
        # 1. Get or create the researcher.
        try:
            researcher = ResearcherRepository.get_by_orcid(db, orcid_id)

            if not researcher:
                record = self.orcid_client.fetch_record(orcid_id)
                name = self._extract_given_name(record)
                researcher = ResearcherRepository.create(db, orcid_id, name)

        except httpx.HTTPStatusError as e:
            if e.response.status_code == 404:
                return {
                    "status": "error",
                    "message": f"El ORCID {orcid_id} no existe en Sandbox."
                }
            return {"status": "error", "message": str(e)}

        # 2. Open the SyncJob.
        job = SyncJobRepository.start_job(db, researcher.id)

        # 3. Fetch the works.
        try:
            works_raw = self.orcid_client.fetch_works(orcid_id)
        except httpx.HTTPStatusError as e:
            if e.response.status_code == 404:
                SyncJobRepository.finish_job(db, job, 0, 0)
                ResearcherRepository.update_last_sync(db, researcher)
                return {
                    "status": "ok",
                    "message": "El ORCID existe pero no tiene publicaciones públicas.",
                    "new_records": 0,
                    "updated_records": 0,
                    "total": 0
                }
            # Do not leave the job in status "running" after a failed fetch.
            self._fail_job(db, job, 0, 0)
            return {"status": "error", "message": str(e)}

        new_records = 0
        updated_records = 0

        # 4. Process the works.
        try:
            for group in (works_raw or {}).get("group") or []:
                summaries = group.get("work-summary") or []
                if not summaries:
                    # A group without summaries has nothing to store.
                    continue
                normalized = PublicationNormalizer.normalize_work(summaries[0])

                # Duplicates are detected by put_code.
                existing = PublicationRepository.get_by_put_code(
                    db, researcher.id, normalized["put_code"]
                )

                if existing:
                    PublicationRepository.update(db, existing, normalized)
                    updated_records += 1
                else:
                    PublicationRepository.create(db, researcher.id, normalized)
                    new_records += 1
        except Exception:
            # Reset the session, record the failure, and let the error surface.
            db.rollback()
            self._fail_job(db, job, new_records, updated_records)
            raise

        # 5. Close the SyncJob.
        SyncJobRepository.finish_job(db, job, new_records, updated_records)

        # 6. Update last_sync_at.
        ResearcherRepository.update_last_sync(db, researcher)

        return {
            "status": "ok",
            "message": "Sincronización completada correctamente.",
            "researcher": researcher.orcid_id,
            "new_records": new_records,
            "updated_records": updated_records,
            "total": new_records + updated_records
        }
# ASCII-only character classes: ``\d`` would also accept non-ASCII Unicode
# digits. ``\Z`` anchors at the true end of the string, so a trailing newline
# is not tolerated.
ORCID_REGEX = re.compile(r"^[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{3}[0-9X]\Z")


def is_valid_orcid(orcid_id: str) -> bool:
    """Return whether ``orcid_id`` is a well-formed ORCID iD.

    Checks:
      * format ``0000-0000-0000-000X``: four hyphen-separated groups of four
        ASCII digits, where the last character may be ``X``;
      * the last character equals the ISO 7064 MOD 11-2 check character
        computed from the first 15 digits.

    Anything that is not a string returns ``False``.
    """
    if not isinstance(orcid_id, str) or not ORCID_REGEX.fullmatch(orcid_id):
        return False

    digits = orcid_id.replace("-", "")

    # Running MOD 11-2 sum over the 15 base digits.
    total = 0
    for char in digits[:-1]:
        total = (total + int(char)) * 2

    result = (12 - total % 11) % 11
    check_digit = "X" if result == 10 else str(result)

    return digits[-1] == check_digit