# ORCID2SWORD/backend/app/services/zip_generator.py

import io
import json
import zipfile
from datetime import datetime, timezone

from app.db.models import Publication, Researcher
from app.services.repository_export import (
    export_filename_for_profile,
    generate_dspace_import_csv,
    generate_dspace_item_dublin_core,
    generate_dublin_core_records_xml,
    generate_dspace_sword_feed_xml,
    generate_eprints_import_xml,
)
from app.services.sword_generator import SWORDGenerator


class ZIPGenerator:
    """Build the export package (an in-memory ZIP) for one researcher.

    The class only groups static helpers; it holds no instance state.
    Each ``generate_*`` helper renders one artifact of the package and
    ``generate_zip`` assembles all of them into the archive bytes.
    """

    # Namespace URI used for the Dublin Core elements embedded in mets.xml.
    _DC_NS = "http://purl.org/dc/elements/1.1/"

    @staticmethod
    def _utc_timestamp():
        """Return the current UTC time as ISO-8601 text with a ``Z`` suffix."""
        # datetime.utcnow() is deprecated since Python 3.12. Take an aware
        # UTC "now" and drop tzinfo so isoformat() does not append "+00:00"
        # in front of the explicit "Z".
        now = datetime.now(timezone.utc).replace(tzinfo=None)
        return now.isoformat() + "Z"

    @staticmethod
    def _format_partial_date(year, month, day):
        """Return ``YYYY``, ``YYYY-MM`` or ``YYYY-MM-DD`` for a partial date.

        The day is only emitted when a month is present; otherwise the
        result would look like ``YYYY-DD`` and be read as a year-month.
        Month and day are coerced with ``int()`` so numeric strings are
        accepted as well as integers.
        """
        parts = [str(year)]
        if month:
            parts.append(f"{int(month):02d}")
            if day:
                parts.append(f"{int(day):02d}")
        return "-".join(parts)

    @staticmethod
    def generate_manifest(researcher, publications):
        """Render the human-readable manifest text of the package.

        Args:
            researcher: object exposing ``orcid_id``, ``name`` and ``id``.
            publications: sequence of objects exposing ``title``,
                ``pub_year``, ``doi`` and ``type``.

        Returns:
            The manifest as a single newline-joined string: a header, the
            list of packaged files, repository hints and one line per
            publication.
        """
        lines = [
            "ORCID Export Package",
            "--------------------",
            f"Researcher ORCID: {researcher.orcid_id}",
            f"Researcher Name: {researcher.name}",
            f"Researcher UUID: {researcher.id}",
            f"Total Publications: {len(publications)}",
            f"Generated At: {ZIPGenerator._utc_timestamp()}",
            "",
            "Files:",
            "- sword.xml              → Atom genérico ORCID (compatibilidad)",
            "- formats/generic-atom.xml",
            "- formats/dublin_core.xml → Dublin Core (un registro por obra)",
            "- formats/dspace-atom.xml → Atom con metadatos DSpace",
            "- formats/dspace-import.csv → Importación batch CSV DSpace",
            "- formats/eprints-import.xml → Importación XML EPrints",
            "- dspace-saf/item_NNNNN/dublin_core.xml → Simple Archive Format (DSpace)",
            "- metadata.json          → Metadatos completos (JSON)",
            "- mets.xml               → METS simplificado (legacy)",
            "",
            "Repository hints:",
            "- DSpace: use dspace-saf/ (SAF) or formats/dspace-import.csv",
            "- EPrints: import formats/eprints-import.xml via admin tools",
            "- Dublin Core: use formats/dublin_core.xml",
            "",
            "SWORD endpoint profile query:",
            "  ?profile=generic|dublin_core|dspace|eprints",
            "",
            "Publications:",
        ]

        # One summary line per publication; a missing year shows as "Unknown".
        lines.extend(
            f"- {pub.title} ({pub.pub_year or 'Unknown'})"
            f" | DOI={pub.doi} | TYPE={pub.type}"
            for pub in publications
        )

        return "\n".join(lines)

    @staticmethod
    def generate_metadata_json(researcher, publications):
        """Serialize researcher and publication metadata to a JSON string.

        Args:
            researcher: object exposing ``orcid_id``, ``name``, ``id`` and
                ``last_sync_at`` (a datetime or a falsy value).
            publications: iterable of publication objects; the attributes
                read from each one are the keys of the per-publication
                dictionary built below (``status`` is optional).

        Returns:
            A JSON document (4-space indent) with the keys ``researcher``,
            ``generated_at`` and ``publications``.
        """
        last_sync = researcher.last_sync_at
        data = {
            "researcher": {
                "orcid_id": researcher.orcid_id,
                "name": researcher.name,
                "id": str(researcher.id),
                "last_sync_at": last_sync.isoformat() if last_sync else None,
            },
            "generated_at": ZIPGenerator._utc_timestamp(),
            "publications": [
                {
                    "id": str(pub.id),
                    "put_code": pub.put_code,
                    "title": pub.title,
                    "subtitle": pub.subtitle,
                    "doi": pub.doi,
                    "journal": pub.journal,
                    "type": pub.type,
                    "url": pub.url,
                    "short_description": pub.short_description,
                    "citation_type": pub.citation_type,
                    "citation_value": pub.citation_value,
                    "language_code": pub.language_code,
                    "country": pub.country,
                    "pub_year": pub.pub_year,
                    "pub_month": pub.pub_month,
                    "pub_day": pub.pub_day,
                    "external_ids": pub.external_ids,
                    "contributors": pub.contributors,
                    "hash_fingerprint": pub.hash_fingerprint,
                    "last_modified": (
                        pub.last_modified.isoformat() if pub.last_modified else None
                    ),
                    # Not every publication object carries a status.
                    "status": getattr(pub, "status", None),
                }
                for pub in publications
            ],
        }

        return json.dumps(data, indent=4)

    @staticmethod
    def generate_mets_xml(researcher, publications):
        """Render the simplified METS document for the package.

        Args:
            researcher: accepted for signature parity with the other
                generators; not read by this method.
            publications: iterable of publication objects exposing the
                descriptive attributes read below.

        Returns:
            The document as UTF-8 encoded bytes including the XML
            declaration.
        """
        from xml.etree.ElementTree import Element, SubElement, tostring

        mets = Element("mets", xmlns="http://www.loc.gov/METS/")

        header = SubElement(mets, "metsHdr")
        agent = SubElement(header, "agent", ROLE="CREATOR", TYPE="OTHER")
        SubElement(agent, "name").text = "ORCID Exporter System"

        # NOTE(review): every publication is appended to this single
        # dmdSec/xmlData wrapper, so records are not delimited from each
        # other. Kept as-is to preserve the existing output layout.
        dmd_sec = SubElement(mets, "dmdSec", ID="dmd1")
        md_wrap = SubElement(dmd_sec, "mdWrap", MDTYPE="DC")
        xml_data = SubElement(md_wrap, "xmlData")

        dc_ns = ZIPGenerator._DC_NS

        def add_dc(name, text):
            # Append one Dublin Core element under the shared xmlData node.
            SubElement(xml_data, f"{{{dc_ns}}}{name}").text = text

        for pub in publications:
            # The title element is always written; the rest only when set.
            add_dc("title", pub.title)
            if pub.subtitle:
                add_dc("description", pub.subtitle)
            if pub.doi:
                add_dc("identifier", f"doi:{pub.doi}")
            if pub.journal:
                add_dc("source", pub.journal)
            if pub.url:
                add_dc("relation", pub.url)
            if pub.short_description:
                add_dc("description", pub.short_description)
            if pub.citation_value:
                # NOTE(review): bibliographicCitation is a DCMI Terms
                # property rather than a DC elements 1.1 one; the element
                # name and namespace are kept for backward compatibility.
                add_dc("bibliographicCitation", pub.citation_value)
            if pub.language_code:
                add_dc("language", pub.language_code)
            if pub.country:
                add_dc("coverage", pub.country)
            if pub.pub_year:
                add_dc(
                    "date",
                    ZIPGenerator._format_partial_date(
                        pub.pub_year, pub.pub_month, pub.pub_day
                    ),
                )
            if pub.type:
                add_dc("type", pub.type)

        return tostring(mets, encoding="utf-8", xml_declaration=True)

    @staticmethod
    def generate_zip(researcher, publications):
        """Assemble the complete export package.

        The archive contains ``sword.xml``, ``manifest.txt``,
        ``metadata.json``, ``mets.xml``, ``formats/dspace-import.csv``, one
        file under ``formats/`` per export profile (generic, dublin_core,
        dspace, eprints) and one
        ``dspace-saf/item_NNNNN/dublin_core.xml`` per publication, numbered
        from 1 in input order.

        Args:
            researcher: researcher object handed to every generator.
            publications: sequence of publication objects to export.

        Returns:
            The bytes of the deflate-compressed ZIP archive.
        """
        generic_xml = SWORDGenerator.generate_feed_xml(researcher, publications)
        manifest = ZIPGenerator.generate_manifest(researcher, publications)
        metadata_json = ZIPGenerator.generate_metadata_json(researcher, publications)
        mets_xml = ZIPGenerator.generate_mets_xml(researcher, publications)

        # Profile-specific exports; the file name of each profile is decided
        # by export_filename_for_profile().
        format_files = {
            f"formats/{export_filename_for_profile('generic')}": generic_xml,
            f"formats/{export_filename_for_profile('dublin_core')}": generate_dublin_core_records_xml(
                researcher, publications
            ),
            f"formats/{export_filename_for_profile('dspace')}": generate_dspace_sword_feed_xml(
                researcher, publications
            ),
            f"formats/{export_filename_for_profile('eprints')}": generate_eprints_import_xml(
                researcher, publications
            ),
        }

        mem_file = io.BytesIO()
        with zipfile.ZipFile(mem_file, "w", zipfile.ZIP_DEFLATED) as zf:
            # The generic feed is also stored at the root as sword.xml.
            zf.writestr("sword.xml", generic_xml)
            zf.writestr("manifest.txt", manifest)
            zf.writestr("metadata.json", metadata_json)
            zf.writestr("mets.xml", mets_xml)
            zf.writestr(
                "formats/dspace-import.csv",
                generate_dspace_import_csv(researcher, publications),
            )

            for path, content in format_files.items():
                zf.writestr(path, content)

            # One item directory per publication, numbered from 00001.
            for index, pub in enumerate(publications, start=1):
                zf.writestr(
                    f"dspace-saf/item_{index:05d}/dublin_core.xml",
                    generate_dspace_item_dublin_core(pub),
                )

        # The archive is finalized when the with-block exits; getvalue()
        # returns the whole buffer regardless of the stream position.
        return mem_file.getvalue()