# ORCID2SWORD/backend/app/services/repository_export.py

"""
Exportadores orientados a repositorios: Dublin Core, DSpace y EPrints.

Perfiles soportados (query `profile` en /export/sword/...):
- generic   → feed Atom ORCID (compatibilidad hacia atrás)
- dublin_core → XML con un registro DC por publicación
- dspace    → feed Atom con metadatos DSpace / Dublin Core ampliado
- eprints   → XML de importación EPrints (EP3)

El ZIP incluye todos los perfiles bajo `formats/` más SAF DSpace en `dspace-saf/`.
"""

from __future__ import annotations

import csv
import io
from datetime import datetime
from typing import Iterable, List
from xml.etree.ElementTree import Element, SubElement, tostring

from app.db.models import Publication, Researcher

# XML namespace URIs used by the exporters in this module.
ATOM_NS = "http://www.w3.org/2005/Atom"
DC_NS = "http://purl.org/dc/elements/1.1/"
DCTERMS_NS = "http://purl.org/dc/terms/"
SWORD_NS = "http://purl.org/net/sword/"
EPRINTS_NS = "http://eprints.org/ep3/data"

# Profile names accepted by normalize_profile(); "generic" is its default.
EXPORT_PROFILES = ("generic", "dublin_core", "dspace", "eprints")

# Publication type -> label returned by dspace_type(). Types missing from this
# table are title-cased by dspace_type() rather than collapsed to "Other".
_DSPACE_TYPE_MAP = {
    "journal-article": "Article",
    "book-chapter": "Book chapter",
    "book": "Book",
    "conference-paper": "Conference paper",
    "conference-abstract": "Conference paper",
    "dissertation-thesis": "Thesis",
    "report": "Report",
    "preprint": "Preprint",
    "other": "Other",
}

# Publication type -> value returned by eprints_type(). Types missing from
# this table are reported as "other".
_EPRINTS_TYPE_MAP = {
    "journal-article": "article",
    "book-chapter": "book_section",
    "book": "book",
    "conference-paper": "conference_item",
    "conference-abstract": "conference_item",
    "dissertation-thesis": "thesis",
    "report": "report",
    "preprint": "preprint",
    "other": "other",
}


def normalize_profile(profile: str | None) -> str:
    """Return the canonical (trimmed, lower-case) export profile name.

    A missing or empty ``profile`` selects ``"generic"``.

    Raises:
        ValueError: If the normalized name is not one of ``EXPORT_PROFILES``.
    """
    candidate = "generic" if not profile else profile
    candidate = candidate.strip().lower()
    if candidate in EXPORT_PROFILES:
        return candidate

    allowed = ", ".join(EXPORT_PROFILES)
    raise ValueError(
        f"Invalid export profile {profile!r}. Use one of: {allowed}"
    )


def publication_date_iso(pub: Publication) -> str | None:
    """Format the publication date as ``year``, ``year-MM`` or ``year-MM-DD``.

    Returns ``None`` when ``pub.pub_year`` is falsy. The day is only appended
    when the month is present as well; month and day are zero-padded to two
    digits.
    """
    if not pub.pub_year:
        return None

    pieces = [str(pub.pub_year)]
    if pub.pub_month:
        pieces.append(f"{pub.pub_month:02d}")
        if pub.pub_day:
            pieces.append(f"{pub.pub_day:02d}")
    return "-".join(pieces)


def contributor_names(pub: Publication) -> List[str]:
    """Collect the non-empty ``name`` values found in ``pub.contributors``.

    A missing/empty contributor list and falsy entries are tolerated. Every
    name that is kept is converted with ``str``; order is preserved.
    """
    entries = pub.contributors or []
    raw_names = ((entry or {}).get("name") for entry in entries)
    return [str(raw) for raw in raw_names if raw]


def split_person_name(full_name: str) -> tuple[str, str]:
    """Split a personal name into ``(family, given)``.

    Two layouts are recognised:

    * ``"Family, Given"`` - split at the first comma.
    * ``"Given [Middle ...] Family"`` - the last whitespace-separated token is
      the family name and everything before it is the given name.

    Surrounding and repeated whitespace is discarded. A single-token name is
    returned as the family name with an empty given name; an empty input
    yields ``("", "")``.
    """
    normalized = " ".join(full_name.split())

    # "Family, Given": only trusted when both sides of the comma are non-empty.
    family, comma, given = (part.strip() for part in normalized.partition(","))
    if comma and family and given:
        return family, given

    # No usable comma layout: ignore stray commas and rely on token order.
    tokens = normalized.replace(",", " ").split()
    if len(tokens) <= 1:
        return (tokens[0] if tokens else ""), ""
    return tokens[-1], " ".join(tokens[:-1])


def dspace_type(pub: Publication) -> str:
    """Return the DSpace type label for ``pub``.

    A falsy ``pub.type`` yields ``"Other"``. Types without an entry in
    ``_DSPACE_TYPE_MAP`` are rendered by replacing hyphens with spaces and
    title-casing the result.
    """
    raw_type = pub.type
    if raw_type:
        humanized = raw_type.replace("-", " ").title()
        return _DSPACE_TYPE_MAP.get(raw_type, humanized)
    return "Other"


def eprints_type(pub: Publication) -> str:
    """Return the EPrints type for ``pub``.

    Falls back to ``"other"`` when ``pub.type`` is falsy or has no entry in
    ``_EPRINTS_TYPE_MAP``.
    """
    raw_type = pub.type
    return _EPRINTS_TYPE_MAP.get(raw_type, "other") if raw_type else "other"


def _safe_text(value) -> str | None:
    """Return ``value`` as stripped text, or ``None`` when nothing usable remains.

    ``None``, blank strings and the literal text ``"none"`` (any letter case)
    all map to ``None``; any other value is converted with ``str`` and
    stripped.
    """
    if value is None:
        return None
    stripped = str(value).strip()
    if not stripped or stripped.lower() == "none":
        return None
    return stripped


def _append_dc(parent: Element, tag: str, text: str | None) -> None:
    """Append a ``tag`` child holding ``text`` to ``parent``.

    Does nothing when ``text`` is falsy, so empty fields leave no element.
    """
    if not text:
        return
    child = SubElement(parent, tag)
    child.text = text


def generate_dublin_core_records_xml(
    researcher: Researcher,
    publications: Iterable[Publication],
) -> bytes:
    """Serialize ``publications`` as a list of Dublin Core records.

    The root ``<dublinCoreRecords>`` element carries the researcher's ORCID and
    name as attributes; each publication becomes one ``<record>`` (attributes
    ``id`` and ``putCode``) whose children are Dublin Core properties. Empty
    fields are omitted.

    ``bibliographicCitation`` and ``provenance`` are written in the DCMI
    *terms* namespace because they are not among the fifteen
    ``elements/1.1`` properties.

    Returns:
        UTF-8 encoded XML including the XML declaration.
    """
    root = Element("dublinCoreRecords", {
        # Attribute values must be strings: ElementTree cannot serialize None.
        "researcherOrcid": researcher.orcid_id or "",
        "researcherName": researcher.name or "",
    })

    def dc(local_name: str) -> str:
        # Clark-notation tag in the DC elements/1.1 namespace.
        return f"{{{DC_NS}}}{local_name}"

    def dcterms(local_name: str) -> str:
        # Clark-notation tag in the DCMI terms namespace.
        return f"{{{DCTERMS_NS}}}{local_name}"

    for pub in publications:
        record = SubElement(root, "record", {
            "id": str(pub.id),
            "putCode": str(pub.put_code),
        })
        _append_dc(record, dc("title"), pub.title)
        _append_dc(record, dc("type"), pub.type)
        _append_dc(record, dc("source"), pub.journal)
        _append_dc(record, dc("language"), pub.language_code)
        _append_dc(record, dc("coverage"), pub.country)
        _append_dc(record, dc("description"), pub.short_description or pub.subtitle)
        _append_dc(record, dcterms("bibliographicCitation"), pub.citation_value)
        # Guards stay explicit where the value is built with a prefix, since
        # the formatted string would be truthy even for a missing field.
        if pub.doi:
            _append_dc(record, dc("identifier"), f"doi:{pub.doi}")
        _append_dc(record, dc("relation"), pub.url)
        _append_dc(record, dc("date"), publication_date_iso(pub))
        for name in contributor_names(pub):
            _append_dc(record, dc("creator"), name)
        if researcher.orcid_id:
            _append_dc(record, dcterms("provenance"), f"orcid:{researcher.orcid_id}")

    return tostring(root, encoding="utf-8", xml_declaration=True)


def generate_dspace_item_dublin_core(pub: Publication) -> bytes:
    """Build the ``<dublin_core>`` metadata document for a single publication.

    Each non-empty field becomes a ``<dcvalue element=... qualifier=...>``
    child. With a DOI, both ``identifier.doi`` and a ``https://doi.org/`` URI
    are written; without one the publication URL is used as ``identifier.uri``.
    The URL is additionally recorded as ``relation.uri`` whenever it exists.

    Returns:
        UTF-8 encoded XML including the XML declaration.
    """
    # (element, qualifier, value) triples in output order; empty values are
    # filtered out when the tree is built.
    fields: list[tuple[str, str, str | None]] = [
        ("title", "none", pub.title),
        ("type", "none", dspace_type(pub)),
        ("source", "none", pub.journal),
        ("language", "iso", pub.language_code),
        ("coverage", "spatial", pub.country),
        ("description", "abstract", pub.short_description or pub.subtitle),
        ("description", "none", pub.citation_value),
        ("date", "issued", publication_date_iso(pub)),
    ]

    if pub.doi:
        fields.append(("identifier", "doi", pub.doi))
        fields.append(("identifier", "uri", f"https://doi.org/{pub.doi}"))
    elif pub.url:
        fields.append(("identifier", "uri", pub.url))

    fields.append(("relation", "uri", pub.url))
    fields.extend(
        ("contributor", "author", author) for author in contributor_names(pub)
    )

    root = Element("dublin_core")
    for element, qualifier, value in fields:
        if not value:
            continue
        node = SubElement(root, "dcvalue", element=element, qualifier=qualifier)
        node.text = value

    return tostring(root, encoding="utf-8", xml_declaration=True)


def generate_dspace_import_csv(
    researcher: Researcher,
    publications: Iterable[Publication],
) -> str:
    """Render ``publications`` as a CSV with one row per publication.

    Every field is quoted and rows end with ``"\\n"``. ``row_id`` is a 1-based
    counter; the ``collection`` and ``dc.publisher`` columns are always left
    empty. When a publication has no contributor names the researcher's name
    is used as author. ``dc.provenance`` is ``orcid:<id>``, or empty when the
    researcher has no ORCID iD.

    Returns:
        The CSV document as text, header row included.
    """
    output = io.StringIO()
    writer = csv.writer(output, quoting=csv.QUOTE_ALL, lineterminator="\n")
    writer.writerow([
        "row_id",
        "collection",
        "dc.title",
        "dc.contributor.author",
        "dc.date.issued",
        "dc.description",
        "dc.identifier.doi",
        "dc.identifier.uri",
        "dc.language.iso",
        "dc.publisher",
        "dc.relation.ispartof",
        "dc.source",
        "dc.type",
        "dc.provenance",
    ])

    # Researcher-level values do not change between rows.
    fallback_author = researcher.name or ""
    # Avoid emitting the literal text "orcid:None" for a missing ORCID iD.
    provenance = f"orcid:{researcher.orcid_id}" if researcher.orcid_id else ""

    for index, pub in enumerate(publications, start=1):
        # NOTE(review): DSpace's batch metadata CSV separates multiple values
        # with "||"; "; " is kept here to preserve the existing output.
        # Confirm which separator the consuming importer expects.
        authors = "; ".join(contributor_names(pub)) or fallback_author
        writer.writerow([
            index,
            "",
            pub.title or "",
            authors,
            publication_date_iso(pub) or "",
            pub.short_description or pub.citation_value or "",
            pub.doi or "",
            pub.url or (f"https://doi.org/{pub.doi}" if pub.doi else ""),
            pub.language_code or "",
            "",
            pub.journal or "",
            pub.journal or "",
            dspace_type(pub),
            provenance,
        ])

    return output.getvalue()


def generate_dspace_sword_feed_xml(
    researcher: Researcher,
    publications: Iterable[Publication],
) -> bytes:
    """Build an Atom feed aimed at DSpace ingest (DC/DCTERMS metadata per entry).

    This does not replace a SWORD 2.0 deposit with bitstreams; it only aligns
    the exported fields with Dublin Core.

    Returns:
        UTF-8 encoded XML including the XML declaration.
    """
    # Namespaces are declared once, literally, on the root element, and child
    # tags use the matching literal prefixes ("dc:title"). Combining these
    # declarations with Clark-notation tags ("{uri}title") makes ElementTree
    # add its own xmlns:* declarations on the root as well, which duplicates
    # the xmlns:dc attribute and yields a document that is not well-formed.
    feed = Element("feed", {
        "xmlns": ATOM_NS,
        "xmlns:dc": DC_NS,
        "xmlns:dcterms": DCTERMS_NS,
        "xmlns:sword": SWORD_NS,
    })

    # One timestamp for the feed and all of its entries.
    updated = datetime.utcnow().isoformat() + "Z"

    SubElement(feed, "title").text = f"DSpace export for {researcher.orcid_id}"
    SubElement(feed, "id").text = f"urn:uuid:{researcher.id}"
    SubElement(feed, "updated").text = updated
    author = SubElement(feed, "author")
    SubElement(author, "name").text = researcher.name or researcher.orcid_id

    for pub in publications:
        item_type = dspace_type(pub)

        entry = SubElement(feed, "entry")
        SubElement(entry, "title").text = pub.title or "Untitled"
        SubElement(entry, "id").text = f"urn:uuid:{pub.id}"
        SubElement(entry, "updated").text = updated
        SubElement(entry, "sword:deposit").text = "true"
        SubElement(entry, "sword:noOp").text = "false"

        category = SubElement(entry, "category")
        category.set("term", item_type)
        category.set("scheme", "http://dspace.org/itemtypes")

        # Optional metadata: (prefixed tag, value); empty values are skipped.
        optional_fields = (
            ("dc:title", pub.title),
            ("dc:source", pub.journal),
            ("dc:identifier", f"doi:{pub.doi}" if pub.doi else None),
            ("dcterms:relation", pub.url),
            ("dcterms:abstract", pub.short_description),
            ("dcterms:bibliographicCitation", pub.citation_value),
            ("dc:language", pub.language_code),
            ("dcterms:issued", publication_date_iso(pub)),
        )
        for tag, value in optional_fields:
            if value:
                SubElement(entry, tag).text = value

        SubElement(entry, "dc:type").text = item_type
        for name in contributor_names(pub):
            author_el = SubElement(entry, "author")
            SubElement(author_el, "name").text = name

    return tostring(feed, encoding="utf-8", xml_declaration=True)


def generate_eprints_import_xml(
    researcher: Researcher,
    publications: Iterable[Publication],
) -> bytes:
    """Build an EPrints (EP3) import document with one ``<eprint>`` per publication.

    ``eprintid`` is a 1-based counter and ``dir`` is derived from it. When a
    publication has no contributor names, the researcher's name is used as
    the only creator. The citation and the source ORCID iD share a single
    ``<note>`` element, separated by a newline.

    Returns:
        UTF-8 encoded XML including the XML declaration.
    """
    root = Element("eprints", xmlns=EPRINTS_NS)
    today = datetime.utcnow().strftime("%Y-%m-%d")

    # Researcher-level values do not change between records.
    fallback_creators = [researcher.name] if researcher.name else []
    orcid_note = (
        f"Source ORCID: {researcher.orcid_id}" if researcher.orcid_id else None
    )

    for index, pub in enumerate(publications, start=1):
        eprint = SubElement(root, "eprint")
        SubElement(eprint, "eprintid").text = str(index)
        SubElement(eprint, "rev_number").text = "1"
        SubElement(eprint, "documents")
        SubElement(eprint, "eprint_status").text = "archive"
        SubElement(eprint, "userid").text = "1"

        # EPrints storage path: the id zero-padded to eight digits and split
        # into two-digit directories, e.g. 123 -> "disk0/00/00/01/23". This
        # keeps the same shape for any id instead of overflowing at 100.
        padded_id = f"{index:08d}"
        id_path = "/".join(
            padded_id[pos:pos + 2] for pos in range(0, len(padded_id), 2)
        )
        SubElement(eprint, "dir").text = f"disk0/{id_path}"

        SubElement(eprint, "datestamp").text = today
        SubElement(eprint, "lastmod").text = today
        SubElement(eprint, "status_changed").text = today
        SubElement(eprint, "type").text = eprints_type(pub)

        titles = SubElement(eprint, "titles")
        title_item = SubElement(titles, "item")
        SubElement(title_item, "lang").text = pub.language_code or "en"
        SubElement(title_item, "title").text = pub.title or "Untitled"

        creators = SubElement(eprint, "creators")
        for name in contributor_names(pub) or fallback_creators:
            family, given = split_person_name(name)
            item = SubElement(creators, "item")
            name_el = SubElement(item, "name")
            SubElement(name_el, "family").text = family
            if given:
                SubElement(name_el, "given").text = given

        # Use the most precise date available (year, year-month or full
        # date), matching the other exporters in this module.
        date_iso = publication_date_iso(pub)
        if date_iso:
            SubElement(eprint, "date").text = date_iso
        if pub.journal:
            SubElement(eprint, "publication").text = pub.journal
        if pub.doi:
            SubElement(eprint, "doi").text = pub.doi
        if pub.url:
            SubElement(eprint, "official_url").text = pub.url
        if pub.short_description:
            SubElement(eprint, "abstract").text = pub.short_description

        # A record carries one note: merge both pieces rather than emitting
        # two sibling <note> elements.
        note_parts = [part for part in (pub.citation_value, orcid_note) if part]
        if note_parts:
            SubElement(eprint, "note").text = "\n".join(note_parts)

    return tostring(root, encoding="utf-8", xml_declaration=True)


def generate_repository_xml(
    researcher: Researcher,
    publications: List[Publication],
    profile: str,
) -> bytes:
    """Render ``publications`` with the exporter selected by ``profile``.

    ``profile`` is validated through ``normalize_profile`` (which raises
    ``ValueError`` for unknown names). The ``generic`` profile is delegated to
    ``SWORDGenerator.generate_feed_xml``.
    """
    resolved = normalize_profile(profile)

    exporters = {
        "dublin_core": generate_dublin_core_records_xml,
        "dspace": generate_dspace_sword_feed_xml,
        "eprints": generate_eprints_import_xml,
    }
    exporter = exporters.get(resolved)
    if exporter is not None:
        return exporter(researcher, publications)

    # Imported only when the generic profile is actually requested.
    from app.services.sword_generator import SWORDGenerator

    return SWORDGenerator.generate_feed_xml(researcher, publications)


def export_filename_for_profile(profile: str) -> str:
    """Return the export file name associated with ``profile``.

    ``profile`` is validated through ``normalize_profile``, which raises
    ``ValueError`` for unknown names.
    """
    filenames = {
        "generic": "generic-atom.xml",
        "dublin_core": "dublin_core.xml",
        "dspace": "dspace-atom.xml",
        "eprints": "eprints-import.xml",
    }
    return filenames[normalize_profile(profile)]