Merge pull request #2 from uja-dev-practices/backend-v2

feat: implement researcher and publication management with sync funct…
This commit is contained in:
Mireya Cueto Garrido
2026-04-21 14:01:00 +02:00
committed by GitHub
13 changed files with 698 additions and 23 deletions
+120
View File
@@ -0,0 +1,120 @@
from fastapi import APIRouter, Depends, HTTPException
from fastapi.responses import Response
from sqlalchemy.orm import Session
from app.schema.publication import PublicationSchema
from app.db.session import get_db
from app.repositories.researcher_repository import ResearcherRepository
from app.repositories.publication_repository import PublicationRepository
from app.services.sync_service import SyncService
from app.services.sword_exporter import SWORDExporter
from app.utils.orcid_validator import is_valid_orcid
router = APIRouter(prefix="/researchers", tags=["researchers"])
def validate_orcid_or_400(orcid_id: str):
    """Reject a malformed ORCID iD with an HTTP 400 response.

    The check itself is delegated to ``is_valid_orcid``. Returns ``None``
    when the identifier is accepted.

    Raises:
        HTTPException: status 400 when ``is_valid_orcid`` rejects the value.
    """
    if is_valid_orcid(orcid_id):
        return

    error_detail = f"ORCID ID '{orcid_id}' no es válido según el formato y dígito de control."
    raise HTTPException(status_code=400, detail=error_detail)
@router.post("/", response_model=dict)
def create_researcher(orcid_id: str, db: Session = Depends(get_db)):
    """Register a researcher by ORCID iD, or report the one already stored.

    Args:
        orcid_id: ORCID iD to register; validated before any lookup.
        db: Database session injected by FastAPI.

    Returns:
        A dict with ``status``, ``message``, ``orcid_id`` and ``id``. The
        message tells whether the row already existed or was just created.

    Raises:
        HTTPException: status 400 when the ORCID iD fails validation.
    """
    validate_orcid_or_400(orcid_id)

    researcher = ResearcherRepository.get_by_orcid(db, orcid_id)
    if researcher:
        message = "Researcher ya existe."
    else:
        # Optionally, this is where the ORCID could be checked against the
        # ORCID API before the row is created.
        researcher = ResearcherRepository.create(db, orcid_id, name=None)
        message = "Researcher creado correctamente."

    return {
        "status": "ok",
        "message": message,
        "orcid_id": researcher.orcid_id,
        "id": researcher.id,
    }
@router.get("/{orcid_id}", response_model=dict)
def get_researcher(orcid_id: str, db: Session = Depends(get_db)):
    """Return the stored record for one researcher.

    Args:
        orcid_id: ORCID iD taken from the URL path.
        db: Database session injected by FastAPI.

    Returns:
        A dict with the researcher's ``orcid_id``, ``name``,
        ``authenticated``, ``access_token``, ``id`` and ``last_sync_at``.

    Raises:
        HTTPException: status 400 for a malformed ORCID iD, status 404 when
            no researcher with that ORCID iD is stored.
    """
    validate_orcid_or_400(orcid_id)

    found = ResearcherRepository.get_by_orcid(db, orcid_id)
    if found:
        # NOTE(review): ``access_token`` is returned verbatim to the caller.
        # No authentication is visible on this route — confirm that exposing
        # the token here is intended before shipping.
        exposed_fields = (
            "orcid_id",
            "name",
            "authenticated",
            "access_token",
            "id",
            "last_sync_at",
        )
        return {field: getattr(found, field) for field in exposed_fields}

    raise HTTPException(status_code=404, detail="Researcher not found")
@router.post("/{orcid_id}/sync", response_model=dict)
def sync_researcher(orcid_id: str, db: Session = Depends(get_db)):
    """Run a synchronisation for one researcher and return its summary.

    Args:
        orcid_id: ORCID iD taken from the URL path.
        db: Database session injected by FastAPI.

    Returns:
        Whatever ``SyncService.sync_researcher`` returns for this ORCID iD.

    Raises:
        HTTPException: status 400 when the ORCID iD fails validation.
    """
    validate_orcid_or_400(orcid_id)
    # A fresh service instance is built for every request.
    return SyncService().sync_researcher(db, orcid_id)
@router.get("/{orcid_id}/publications", response_model=list[PublicationSchema], tags=["researchers"])
def get_publications(orcid_id: str, db: Session = Depends(get_db)):
    """List the stored publications of one researcher.

    Args:
        orcid_id: ORCID iD taken from the URL path.
        db: Database session injected by FastAPI.

    Returns:
        The researcher's ``publications`` relationship, serialised through
        ``PublicationSchema``.

    Raises:
        HTTPException: status 400 for a malformed ORCID iD, status 404 when
            no researcher with that ORCID iD is stored.
    """
    # Validate first, like every other /{orcid_id} route in this router, so a
    # malformed identifier is reported as 400 rather than as a misleading 404.
    validate_orcid_or_400(orcid_id)

    researcher = ResearcherRepository.get_by_orcid(db, orcid_id)
    if not researcher:
        raise HTTPException(status_code=404, detail="Researcher not found")
    return researcher.publications
@router.get("/{orcid_id}/export/sword.xml")
def export_sword_xml(orcid_id: str, db: Session = Depends(get_db)):
    """Download a researcher's publications as a SWORD feed XML file.

    Args:
        orcid_id: ORCID iD taken from the URL path.
        db: Database session injected by FastAPI.

    Returns:
        A ``Response`` with media type ``application/xml``, sent as an
        attachment named after the ORCID iD.

    Raises:
        HTTPException: status 400 for a malformed ORCID iD, status 404 when
            no researcher with that ORCID iD is stored.
    """
    validate_orcid_or_400(orcid_id)

    owner = ResearcherRepository.get_by_orcid(db, orcid_id)
    if not owner:
        raise HTTPException(status_code=404, detail="Researcher not found")

    publications = PublicationRepository.list_by_researcher(db, owner.id)
    payload = SWORDExporter.export_feed_xml(owner, publications)
    disposition = f'attachment; filename="sword_{orcid_id}.xml"'
    return Response(
        content=payload,
        media_type="application/xml",
        headers={"Content-Disposition": disposition},
    )
@router.get("/{orcid_id}/export/sword.zip")
def export_sword_zip(orcid_id: str, db: Session = Depends(get_db)):
    """Download a researcher's publications as a SWORD ZIP package.

    Args:
        orcid_id: ORCID iD taken from the URL path.
        db: Database session injected by FastAPI.

    Returns:
        A ``Response`` with media type ``application/zip``, sent as an
        attachment named after the ORCID iD.

    Raises:
        HTTPException: status 400 for a malformed ORCID iD, status 404 when
            no researcher with that ORCID iD is stored.
    """
    validate_orcid_or_400(orcid_id)

    owner = ResearcherRepository.get_by_orcid(db, orcid_id)
    if not owner:
        raise HTTPException(status_code=404, detail="Researcher not found")

    publications = PublicationRepository.list_by_researcher(db, owner.id)
    archive = SWORDExporter.export_zip(owner, publications)
    disposition = f'attachment; filename="sword_{orcid_id}.zip"'
    return Response(
        content=archive,
        media_type="application/zip",
        headers={"Content-Disposition": disposition},
    )
+37 -5
View File
@@ -1,9 +1,11 @@
from sqlalchemy import Column, String, Boolean, Integer, DateTime, Text from sqlalchemy import Column, String, Boolean, Integer, DateTime, Text, ForeignKey
from sqlalchemy.dialects.postgresql import UUID from sqlalchemy.dialects.postgresql import UUID
from sqlalchemy.sql import func from sqlalchemy.sql import func
from sqlalchemy.orm import relationship
from .session import Base from .session import Base
import uuid import uuid
class Researcher(Base): class Researcher(Base):
__tablename__ = "researchers" __tablename__ = "researchers"
@@ -11,18 +13,48 @@ class Researcher(Base):
orcid_id = Column(String(19), unique=True, nullable=False) orcid_id = Column(String(19), unique=True, nullable=False)
name = Column(Text) name = Column(Text)
authenticated = Column(Boolean, default=False) authenticated = Column(Boolean, default=False)
access_token = Column(Text) access_token = Column(Text, nullable=True)
last_sync_at = Column(DateTime) last_sync_at = Column(DateTime(timezone=True), server_default=func.now())
publications = relationship(
"Publication",
back_populates="researcher",
cascade="all, delete-orphan"
)
sync_jobs = relationship(
"SyncJob",
back_populates="researcher",
cascade="all, delete-orphan"
)
class Publication(Base):
    """ORM model for one publication row in the ``publications`` table.

    Restored from the post-change column of the side-by-side diff; the
    columns and their order are exactly those shown there.
    """

    __tablename__ = "publications"

    # Primary key, generated client-side with uuid4.
    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    # Owning researcher (foreign key to researchers.id).
    researcher_id = Column(UUID(as_uuid=True), ForeignKey("researchers.id"))
    put_code = Column(Integer)
    title = Column(Text)
    journal = Column(Text)
    doi = Column(Text)
    pub_year = Column(Integer)
    type = Column(Text)
    hash_fingerprint = Column(Text)
    last_modified = Column(DateTime(timezone=True))

    # Mirror of Researcher.publications.
    researcher = relationship("Researcher", back_populates="publications")
class SyncJob(Base):
    """ORM model for one synchronisation run in the ``sync_jobs`` table."""

    __tablename__ = "sync_jobs"

    # Primary key, generated client-side with uuid4.
    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    # Researcher this run belongs to (foreign key to researchers.id).
    researcher_id = Column(UUID(as_uuid=True), ForeignKey("researchers.id"))
    # Short status label, at most 20 characters.
    status = Column(String(20))
    # Counters for the run; both default to 0.
    new_records = Column(Integer, default=0)
    updated_records = Column(Integer, default=0)
    # Start time defaults to the database clock; the end time has no default.
    started_at = Column(DateTime(timezone=True), server_default=func.now())
    finished_at = Column(DateTime(timezone=True))

    # Mirror of Researcher.sync_jobs.
    researcher = relationship("Researcher", back_populates="sync_jobs")
+18 -2
View File
@@ -4,7 +4,23 @@ import os
DATABASE_URL = os.getenv("DATABASE_URL") DATABASE_URL = os.getenv("DATABASE_URL")
engine = create_engine(DATABASE_URL) engine = create_engine(
SessionLocal = sessionmaker(bind=engine, autocommit=False, autoflush=False) DATABASE_URL,
future=True,
echo=False
)
SessionLocal = sessionmaker(
autocommit=False,
autoflush=False,
bind=engine
)
Base = declarative_base() Base = declarative_base()
def get_db():
    """Yield a database session and close it when the consumer is done.

    Written as a generator so it can be used as a FastAPI dependency: the
    session is created from ``SessionLocal``, handed to the caller, and
    closed in the ``finally`` clause even if the caller raised.
    """
    session = SessionLocal()
    try:
        yield session
    finally:
        session.close()
+29 -6
View File
@@ -1,9 +1,32 @@
from fastapi import FastAPI from fastapi import FastAPI
from app.services.orcid_client import ORCIDClient from app.api.researchers import router as researchers_router
from app.db.session import Base, engine
app = FastAPI()
@app.get("/orcid/{orcid_id}/works") app = FastAPI(
def test_works(orcid_id: str): title="ORCID SWORD Backend",
client = ORCIDClient() description="Backend para sincronización ORCID y exportación SWORD",
return client.fetch_works(orcid_id) version="1.0.0"
)
# ---------------------------------------------------------
# Crear tablas al iniciar la aplicación
# ---------------------------------------------------------
@app.on_event("startup")
def startup_event():
    """Create the tables registered on ``Base`` when the app starts."""
    metadata = Base.metadata
    metadata.create_all(bind=engine)
# ---------------------------------------------------------
# Healthcheck
# ---------------------------------------------------------
@app.get("/health")
def health():
    """Healthcheck endpoint: always answers with a fixed OK payload."""
    return dict(status="ok")
# ---------------------------------------------------------
# Registrar routers
# ---------------------------------------------------------
app.include_router(researchers_router)
@@ -0,0 +1,67 @@
from sqlalchemy.orm import Session
from app.db.models import Publication
class PublicationRepository:
    """Database access helpers for ``Publication`` rows.

    Every method is static and receives the session to work on. The write
    methods commit immediately and refresh the row before returning it.
    """

    # Fields copied from a normalized dict when updating an existing row.
    _UPDATABLE_FIELDS = (
        "title",
        "journal",
        "doi",
        "pub_year",
        "type",
        "hash_fingerprint",
    )
    # On creation the put_code is stored as well, ahead of the other fields.
    _CREATE_FIELDS = ("put_code",) + _UPDATABLE_FIELDS

    @staticmethod
    def get_by_put_code(db: Session, researcher_id: str, put_code: int):
        """Return the researcher's publication with this put_code, or None."""
        query = (
            db.query(Publication)
            .filter(Publication.researcher_id == researcher_id)
            .filter(Publication.put_code == put_code)
        )
        return query.first()

    @staticmethod
    def create(db: Session, researcher_id: str, data: dict):
        """Insert a new publication built from a normalized dict.

        Raises:
            KeyError: when ``data`` lacks one of the expected keys.
        """
        values = {
            field: data[field]
            for field in PublicationRepository._CREATE_FIELDS
        }
        row = Publication(researcher_id=researcher_id, **values)
        db.add(row)
        db.commit()
        db.refresh(row)
        return row

    @staticmethod
    def update(db: Session, publication: Publication, data: dict):
        """Overwrite an existing publication with freshly normalized values.

        The put_code and the owning researcher are left untouched.

        Raises:
            KeyError: when ``data`` lacks one of the expected keys.
        """
        for field in PublicationRepository._UPDATABLE_FIELDS:
            setattr(publication, field, data[field])
        db.commit()
        db.refresh(publication)
        return publication

    @staticmethod
    def list_by_researcher(db: Session, researcher_id: str):
        """List a researcher's publications, newest year first, NULLs last."""
        newest_first = Publication.pub_year.desc().nullslast()
        owned = db.query(Publication).filter(
            Publication.researcher_id == researcher_id
        )
        return owned.order_by(newest_first).all()
@@ -0,0 +1,25 @@
from sqlalchemy.orm import Session
from app.db.models import Researcher
from sqlalchemy.sql import func
class ResearcherRepository:
    """Database access helpers for ``Researcher`` rows.

    Every method is static and receives the session to work on.
    """

    @staticmethod
    def _commit_and_refresh(db: Session, row):
        """Commit the session and reload ``row`` from the database."""
        db.commit()
        db.refresh(row)
        return row

    @staticmethod
    def get_by_orcid(db: Session, orcid_id: str):
        """Return the researcher with this ORCID iD, or None if not stored."""
        matching = db.query(Researcher).filter(Researcher.orcid_id == orcid_id)
        return matching.first()

    @staticmethod
    def create(db: Session, orcid_id: str, name: str = None):
        """Insert a researcher and return the refreshed row."""
        row = Researcher(orcid_id=orcid_id, name=name)
        db.add(row)
        return ResearcherRepository._commit_and_refresh(db, row)

    @staticmethod
    def update_last_sync(db: Session, researcher: Researcher):
        """Stamp ``last_sync_at`` with the database's current time."""
        # func.now() is a SQL expression, so the database clock is used.
        researcher.last_sync_at = func.now()
        return ResearcherRepository._commit_and_refresh(db, researcher)
@@ -0,0 +1,28 @@
from sqlalchemy.orm import Session
from app.db.models import SyncJob
from sqlalchemy.sql import func
class SyncJobRepository:
    """Database access helpers for ``SyncJob`` rows.

    Both methods are static, commit immediately and return the refreshed job.
    """

    @staticmethod
    def start_job(db: Session, researcher_id: str):
        """Create a job in the "running" state for the given researcher."""
        running_job = SyncJob(
            researcher_id=researcher_id,
            status="running",
            started_at=func.now(),
        )
        db.add(running_job)
        db.commit()
        db.refresh(running_job)
        return running_job

    @staticmethod
    def finish_job(db: Session, job: SyncJob, new_records: int, updated_records: int):
        """Mark a job as "finished" and store its counters and end time."""
        completion = {
            "status": "finished",
            "new_records": new_records,
            "updated_records": updated_records,
            "finished_at": func.now(),
        }
        for attribute, value in completion.items():
            setattr(job, attribute, value)
        db.commit()
        db.refresh(job)
        return job
+16
View File
@@ -0,0 +1,16 @@
from datetime import datetime
from uuid import UUID

from pydantic import BaseModel
class PublicationSchema(BaseModel):
    """Response schema for a publication, populated from ORM attributes."""

    id: UUID
    put_code: int | None = None
    title: str
    journal: str | None = None
    doi: str | None = None
    pub_year: int | None = None
    type: str | None = None
    hash_fingerprint: str | None = None
    # The ORM column is a DateTime. Declaring it as ``str`` would make
    # validation fail for any populated value, because a datetime object is
    # not coerced to ``str``; as ``datetime`` it is emitted as an ISO 8601
    # string in the JSON response.
    last_modified: datetime | None = None

    class Config:
        # Allow building the schema from ORM objects, not only from dicts.
        from_attributes = True
+74
View File
@@ -0,0 +1,74 @@
class PublicationNormalizer:
    """Flatten raw ORCID work summaries into plain publication dicts."""

    @staticmethod
    def safe_get_title(summary):
        """Extract the title string from a work summary, or None.

        Handles the shapes this code has to cope with:

        * ``{"title": {"title": {"value": "..."}}}``
        * ``{"title": {"title": "..."}}``
        * ``{"title": "..."}``
        * ``{"title": {"value": "..."}}``
        """
        title_node = summary.get("title")
        if isinstance(title_node, str):
            return title_node
        if not isinstance(title_node, dict):
            return None

        inner = title_node.get("title")
        if isinstance(inner, dict):
            return inner.get("value")
        if isinstance(inner, str):
            return inner
        return title_node.get("value")

    @staticmethod
    def _parse_year(raw):
        """Convert a raw year value to ``int``; None when absent or invalid."""
        if raw is None:
            return None
        try:
            return int(raw)
        except (TypeError, ValueError):
            return None

    @staticmethod
    def normalize_work(summary: dict) -> dict:
        """Normalize one work summary.

        Args:
            summary: Raw work-summary dict. Keys may be missing or may be
                present with a ``None`` value; both are treated as "absent".

        Returns:
            A dict with ``put_code``, ``title`` (``"Untitled"`` when no title
            was found), ``journal``, ``doi``, ``pub_year`` (an ``int`` or
            None), ``type`` and ``hash_fingerprint``.
        """
        title = PublicationNormalizer.safe_get_title(summary)

        # Journal title: either a plain value or a dict wrapping it.
        journal_raw = summary.get("journal-title")
        if isinstance(journal_raw, dict):
            journal = journal_raw.get("value") or journal_raw.get("title")
        else:
            journal = journal_raw

        # DOI: value of the first external id whose type is "doi".
        # ``or {}`` / ``or []`` matter here: ``.get(key, {})`` only covers a
        # missing key, not a key that is present with a null value.
        external_ids = (summary.get("external-ids") or {}).get("external-id") or []
        doi = next(
            (
                ext.get("external-id-value")
                for ext in external_ids
                if ext.get("external-id-type") == "doi"
            ),
            None,
        )

        # Publication year, tolerating a null date or a null year node.
        year_node = (summary.get("publication-date") or {}).get("year") or {}
        pub_year = PublicationNormalizer._parse_year(year_node.get("value"))

        work_type = summary.get("type")
        put_code = summary.get("put-code")

        # Fingerprint is built from the raw title (before the "Untitled"
        # fallback), lower-cased and with spaces removed.
        fingerprint = f"{title}-{doi}-{pub_year}-{work_type}".lower().replace(" ", "")

        return {
            "put_code": put_code,
            "title": title or "Untitled",
            "journal": journal,
            "doi": doi,
            "pub_year": pub_year,
            "type": work_type,
            "hash_fingerprint": fingerprint,
        }
+5 -10
View File
@@ -2,18 +2,13 @@ import httpx
import os import os
from typing import Optional from typing import Optional
class ORCIDClient: class ORCIDClient:
"""
Cliente para interactuar con la Public API de ORCID. TOKEN_URL = "https://sandbox.orcid.org/oauth/token"
Permite: BASE_URL = "https://pub.sandbox.orcid.org/v3.0"
- Obtener token público
- Consultar /record
- Consultar /works
"""
TOKEN_URL = "https://orcid.org/oauth/token" # TOKEN_URL = "https://orcid.org/oauth/token"
BASE_URL = "https://pub.orcid.org/v3.0" # BASE_URL = "https://pub.orcid.org/v3.0"
def __init__(self): def __init__(self):
self.client_id = os.getenv("ORCID_CLIENT_ID") self.client_id = os.getenv("ORCID_CLIENT_ID")
+155
View File
@@ -0,0 +1,155 @@
from datetime import datetime
from xml.etree.ElementTree import Element, SubElement, tostring
from io import BytesIO
import zipfile
import json
class SWORDExporter:
    """Build SWORD export artefacts (feed XML, manifest, JSON, METS, ZIP).

    All methods are static. ``researcher`` must expose ``orcid_id``, ``name``
    and ``id``; each publication must expose ``id``, ``title``, ``doi``,
    ``pub_year``, ``type`` and ``journal``.
    """

    ATOM_NS = "http://www.w3.org/2005/Atom"
    DC_NS = "http://purl.org/dc/elements/1.1/"

    @staticmethod
    def _utc_timestamp() -> str:
        """Return the current UTC time as an ISO 8601 string ending in "Z"."""
        # Local import: the module itself only imports ``datetime``.
        from datetime import timezone

        # ``datetime.utcnow()`` is deprecated; take an aware "now" and drop
        # the tzinfo so the text keeps the same "...Z" shape as before
        # instead of gaining a "+00:00" suffix.
        now = datetime.now(timezone.utc)
        return now.replace(tzinfo=None).isoformat() + "Z"

    @staticmethod
    def _add_dc(parent, tag: str, text):
        """Append a Dublin Core child element with the given text."""
        node = SubElement(parent, f"{{{SWORDExporter.DC_NS}}}{tag}")
        node.text = text
        return node

    # ---------------------------------------------------------
    # 1) MAIN XML (sword.xml)
    # ---------------------------------------------------------
    @staticmethod
    def export_feed_xml(researcher, publications) -> bytes:
        """Serialise the publications as an Atom feed with Dublin Core fields.

        Returns:
            UTF-8 encoded XML, including the XML declaration.
        """
        # One timestamp for the whole document, so the feed and all of its
        # entries agree on when the export was generated.
        generated_at = SWORDExporter._utc_timestamp()

        feed = Element("feed", xmlns=SWORDExporter.ATOM_NS)
        SubElement(feed, "title").text = f"Publications for {researcher.orcid_id}"

        author = SubElement(feed, "author")
        SubElement(author, "name").text = researcher.name or "Unknown"

        SubElement(feed, "updated").text = generated_at
        SubElement(feed, "id").text = f"urn:uuid:{researcher.id}"

        for pub in publications:
            entry = SubElement(feed, "entry")
            SubElement(entry, "id").text = f"urn:uuid:{pub.id}"
            SubElement(entry, "updated").text = generated_at

            SWORDExporter._add_dc(entry, "title", pub.title)
            # Optional fields are emitted only when they hold a value.
            if pub.doi:
                SWORDExporter._add_dc(entry, "identifier", f"doi:{pub.doi}")
            if pub.pub_year:
                SWORDExporter._add_dc(entry, "date", str(pub.pub_year))
            if pub.type:
                SWORDExporter._add_dc(entry, "type", pub.type)
            if pub.journal:
                SWORDExporter._add_dc(entry, "source", pub.journal)

        return tostring(feed, encoding="utf-8", xml_declaration=True)

    # ---------------------------------------------------------
    # 2) manifest.txt
    # ---------------------------------------------------------
    @staticmethod
    def generate_manifest(researcher, publications) -> str:
        """Return a plain-text summary of the package contents."""
        lines = [
            "SWORD Deposit Package",
            "----------------------",
            f"Researcher ORCID: {researcher.orcid_id}",
            f"Researcher Name: {researcher.name or 'Unknown'}",
            f"Total Publications: {len(publications)}",
            f"Generated At: {SWORDExporter._utc_timestamp()}",
            "",
            "Publications:",
        ]
        lines.extend(
            f"- {pub.title} ({pub.pub_year}) DOI={pub.doi}"
            for pub in publications
        )
        return "\n".join(lines)

    # ---------------------------------------------------------
    # 3) metadata.json
    # ---------------------------------------------------------
    @staticmethod
    def generate_metadata_json(researcher, publications) -> str:
        """Return the researcher and publication metadata as a JSON string."""
        data = {
            "researcher": {
                "orcid_id": researcher.orcid_id,
                "name": researcher.name,
                "id": str(researcher.id),
            },
            "generated_at": SWORDExporter._utc_timestamp(),
            "publications": [
                {
                    "id": str(pub.id),
                    "title": pub.title,
                    "doi": pub.doi,
                    "year": pub.pub_year,
                    "type": pub.type,
                    "journal": pub.journal,
                }
                for pub in publications
            ],
        }
        return json.dumps(data, indent=4)

    # ---------------------------------------------------------
    # 4) mets.xml (simple version)
    # ---------------------------------------------------------
    @staticmethod
    def generate_mets_xml(researcher, publications) -> bytes:
        """Return a minimal METS document listing titles and DOIs.

        ``researcher`` is accepted for signature symmetry with the other
        generators; it is not used here.
        """
        mets = Element("mets", xmlns="http://www.loc.gov/METS/")

        header = SubElement(mets, "metsHdr")
        agent = SubElement(header, "agent", ROLE="CREATOR", TYPE="OTHER")
        SubElement(agent, "name").text = "ORCID Exporter System"

        dmd_sec = SubElement(mets, "dmdSec", ID="dmd1")
        md_wrap = SubElement(dmd_sec, "mdWrap", MDTYPE="DC")
        xml_data = SubElement(md_wrap, "xmlData")

        for pub in publications:
            SWORDExporter._add_dc(xml_data, "title", pub.title)
            if pub.doi:
                SWORDExporter._add_dc(xml_data, "identifier", f"doi:{pub.doi}")

        return tostring(mets, encoding="utf-8", xml_declaration=True)

    # ---------------------------------------------------------
    # 5) FINAL ZIP
    # ---------------------------------------------------------
    @staticmethod
    def export_zip(researcher, publications) -> bytes:
        """Bundle all four artefacts into an in-memory ZIP archive.

        Returns:
            The bytes of a deflate-compressed ZIP containing ``sword.xml``,
            ``manifest.txt``, ``metadata.json`` and ``mets.xml``.
        """
        members = {
            "sword.xml": SWORDExporter.export_feed_xml(researcher, publications),
            "manifest.txt": SWORDExporter.generate_manifest(researcher, publications),
            "metadata.json": SWORDExporter.generate_metadata_json(researcher, publications),
            "mets.xml": SWORDExporter.generate_mets_xml(researcher, publications),
        }

        buffer = BytesIO()
        with zipfile.ZipFile(buffer, mode="w", compression=zipfile.ZIP_DEFLATED) as zf:
            for filename, content in members.items():
                zf.writestr(filename, content)

        # getvalue() returns the whole buffer regardless of stream position.
        return buffer.getvalue()
+96
View File
@@ -0,0 +1,96 @@
from sqlalchemy.orm import Session
from app.services.orcid_client import ORCIDClient
from app.services.normalizer import PublicationNormalizer
from app.repositories.researcher_repository import ResearcherRepository
from app.repositories.publication_repository import PublicationRepository
from app.repositories.syncjob_repository import SyncJobRepository
import httpx
class SyncService:
    """Synchronise a researcher's ORCID works into the local database."""

    def __init__(self):
        self.orcid_client = ORCIDClient()

    @staticmethod
    def _extract_given_name(record):
        """Return ``person.name.given-names.value`` from a record, or None.

        Every level may be missing or present with a null value; both cases
        yield None instead of raising.
        """
        person = (record or {}).get("person") or {}
        name = person.get("name") or {}
        given_names = name.get("given-names") or {}
        return given_names.get("value")

    def sync_researcher(self, db: Session, orcid_id: str):
        """Fetch the researcher's works and upsert them by put_code.

        Args:
            db: Database session used for every read and write.
            orcid_id: ORCID iD of the researcher to synchronise.

        Returns:
            A dict with ``status`` ("ok" or "error") and ``message``. On
            success it also carries ``new_records``, ``updated_records`` and
            ``total``. HTTP status errors raised by the ORCID client are
            reported in the dict rather than propagated.
        """
        # 1. Get or create the researcher.
        try:
            researcher = ResearcherRepository.get_by_orcid(db, orcid_id)

            if not researcher:
                record = self.orcid_client.fetch_record(orcid_id)
                name = self._extract_given_name(record)
                researcher = ResearcherRepository.create(db, orcid_id, name)

        except httpx.HTTPStatusError as e:
            if e.response.status_code == 404:
                return {
                    "status": "error",
                    "message": f"El ORCID {orcid_id} no existe en Sandbox."
                }
            return {"status": "error", "message": str(e)}

        # 2. Open a SyncJob for this run.
        job = SyncJobRepository.start_job(db, researcher.id)

        # 3. Fetch the works.
        try:
            works_raw = self.orcid_client.fetch_works(orcid_id)

        except httpx.HTTPStatusError as e:
            if e.response.status_code == 404:
                # No public works: close the job as an empty, successful run.
                SyncJobRepository.finish_job(db, job, 0, 0)
                ResearcherRepository.update_last_sync(db, researcher)

                return {
                    "status": "ok",
                    "message": "El ORCID existe pero no tiene publicaciones públicas.",
                    "new_records": 0,
                    "updated_records": 0,
                    "total": 0
                }

            # Do not leave the job in "running" forever after a failure.
            job.status = "failed"
            db.commit()
            return {"status": "error", "message": str(e)}

        new_records = 0
        updated_records = 0

        # 4. Process the works; a null or missing group list means "none".
        for group in (works_raw or {}).get("group") or []:
            summaries = group.get("work-summary") or []
            if not summaries:
                # Nothing to normalise for this group; skip it rather than
                # abort the whole run on an IndexError/KeyError.
                continue

            normalized = PublicationNormalizer.normalize_work(summaries[0])

            # Duplicates are detected by put_code within this researcher.
            existing = PublicationRepository.get_by_put_code(
                db, researcher.id, normalized["put_code"]
            )

            if existing:
                PublicationRepository.update(db, existing, normalized)
                updated_records += 1
            else:
                PublicationRepository.create(db, researcher.id, normalized)
                new_records += 1

        # 5. Close the SyncJob.
        SyncJobRepository.finish_job(db, job, new_records, updated_records)

        # 6. Record the time of this sync.
        ResearcherRepository.update_last_sync(db, researcher)

        return {
            "status": "ok",
            "message": "Sincronización completada correctamente.",
            "researcher": researcher.orcid_id,
            "new_records": new_records,
            "updated_records": updated_records,
            "total": new_records + updated_records
        }
+28
View File
@@ -0,0 +1,28 @@
import re
# ASCII digits only: ``\d`` would also match other Unicode decimal digits,
# which ``int()`` happily converts, letting non-ASCII identifiers through.
ORCID_REGEX = re.compile(r"^[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{3}[0-9X]$")


def is_valid_orcid(orcid_id: str) -> bool:
    """Validate an ORCID iD.

    Two checks are applied:

    * Format: ``0000-0000-0000-0000``, ASCII digits, where the last
      character may be an uppercase ``X``.
    * Check character: ISO 7064 Mod 11-2 over the first 15 digits.

    Args:
        orcid_id: Candidate identifier. Non-string values are rejected.

    Returns:
        True only when both the format and the check character are valid.
    """
    # fullmatch (rather than match) also rejects a trailing newline, which
    # ``$`` alone would tolerate.
    if not isinstance(orcid_id, str) or not ORCID_REGEX.fullmatch(orcid_id):
        return False

    digits = orcid_id.replace("-", "")

    # Running total over the first 15 digits.
    total = 0
    for char in digits[:-1]:
        total = (total + int(char)) * 2

    result = (12 - total % 11) % 11
    check_digit = "X" if result == 10 else str(result)

    return digits[-1] == check_digit