Files
ORCID2SWORD/backend/app/services/zip_generator.py
T
Mireya Cueto Garrido aa2e7280dc feat(export): perfiles DSpace/EPrints/Dublin Core y selector SWORD en UI
Backend: generadores por repositorio, ZIP multi-formato y query profile en /export/sword. Frontend: selector Destino que envia profile al descargar SWORD XML.
2026-05-20 13:25:35 +02:00

191 lines
7.9 KiB
Python

import io
import json
import zipfile
from datetime import datetime
from app.db.models import Publication, Researcher
from app.services.repository_export import (
export_filename_for_profile,
generate_dspace_import_csv,
generate_dspace_item_dublin_core,
generate_dublin_core_records_xml,
generate_dspace_sword_feed_xml,
generate_eprints_import_xml,
)
from app.services.sword_generator import SWORDGenerator
class ZIPGenerator:
    """Assemble the multi-format export ZIP for a researcher's publications.

    Every method is static. ``generate_zip`` is the entry point; the other
    public methods each render one text/XML member of the archive and can be
    called on their own.
    """

    @staticmethod
    def _utc_timestamp() -> str:
        """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SS[.ffffff]Z``."""
        from datetime import timezone

        # datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC
        # "now" and drop tzinfo only for rendering, so the text keeps the
        # historical "...Z" shape instead of becoming "...+00:00Z".
        now = datetime.now(timezone.utc)
        return now.replace(tzinfo=None).isoformat() + "Z"

    @staticmethod
    def generate_manifest(researcher, publications) -> str:
        """Render the human-readable ``manifest.txt`` content.

        Args:
            researcher: object exposing ``orcid_id``, ``name`` and ``id``.
            publications: sized iterable of objects exposing ``title``,
                ``pub_year``, ``doi`` and ``type``.

        Returns:
            The manifest as one newline-joined string (no trailing newline).
            A falsy ``pub_year`` is shown as ``Unknown``; a missing DOI or
            type is rendered through ``str()`` (e.g. ``DOI=None``).
        """
        header = [
            "ORCID Export Package",
            "--------------------",
            f"Researcher ORCID: {researcher.orcid_id}",
            f"Researcher Name: {researcher.name}",
            f"Researcher UUID: {researcher.id}",
            f"Total Publications: {len(publications)}",
            f"Generated At: {ZIPGenerator._utc_timestamp()}",
            "",
            "Files:",
            "- sword.xml → Atom genérico ORCID (compatibilidad)",
            "- formats/generic-atom.xml",
            "- formats/dublin_core.xml → Dublin Core (un registro por obra)",
            "- formats/dspace-atom.xml → Atom con metadatos DSpace",
            "- formats/dspace-import.csv → Importación batch CSV DSpace",
            "- formats/eprints-import.xml → Importación XML EPrints",
            "- dspace-saf/item_NNNNN/dublin_core.xml → Simple Archive Format (DSpace)",
            "- metadata.json → Metadatos completos (JSON)",
            "- mets.xml → METS simplificado (legacy)",
            "",
            "Repository hints:",
            "- DSpace: use dspace-saf/ (SAF) or formats/dspace-import.csv",
            "- EPrints: import formats/eprints-import.xml via admin tools",
            "- Dublin Core: use formats/dublin_core.xml",
            "",
            "SWORD endpoint profile query:",
            " ?profile=generic|dublin_core|dspace|eprints",
            "",
            "Publications:",
        ]
        entries = [
            f"- {pub.title} ({pub.pub_year or 'Unknown'})"
            f" | DOI={pub.doi} | TYPE={pub.type}"
            for pub in publications
        ]
        return "\n".join(header + entries)

    @staticmethod
    def generate_metadata_json(researcher, publications) -> str:
        """Serialize researcher and publication metadata for ``metadata.json``.

        Args:
            researcher: object exposing ``orcid_id``, ``name``, ``id`` and
                ``last_sync_at`` (``None`` or an object with ``isoformat()``).
            publications: iterable of publication objects; every attribute
                listed below is read directly, except ``status`` which is
                optional and defaults to ``None``.

        Returns:
            A JSON document (4-space indent) with the keys ``researcher``,
            ``generated_at`` and ``publications``.
        """
        last_sync = researcher.last_sync_at

        def serialize(pub):
            # ``id`` is stringified and ``last_modified`` ISO-formatted; the
            # remaining values are passed to json.dumps unchanged.
            modified = pub.last_modified
            return {
                "id": str(pub.id),
                "put_code": pub.put_code,
                "title": pub.title,
                "subtitle": pub.subtitle,
                "doi": pub.doi,
                "journal": pub.journal,
                "type": pub.type,
                "url": pub.url,
                "short_description": pub.short_description,
                "citation_type": pub.citation_type,
                "citation_value": pub.citation_value,
                "language_code": pub.language_code,
                "country": pub.country,
                "pub_year": pub.pub_year,
                "pub_month": pub.pub_month,
                "pub_day": pub.pub_day,
                "external_ids": pub.external_ids,
                "contributors": pub.contributors,
                "hash_fingerprint": pub.hash_fingerprint,
                "last_modified": modified.isoformat() if modified else None,
                "status": getattr(pub, "status", None),
            }

        payload = {
            "researcher": {
                "orcid_id": researcher.orcid_id,
                "name": researcher.name,
                "id": str(researcher.id),
                "last_sync_at": last_sync.isoformat() if last_sync else None,
            },
            "generated_at": ZIPGenerator._utc_timestamp(),
            "publications": [serialize(pub) for pub in publications],
        }
        return json.dumps(payload, indent=4)

    @staticmethod
    def generate_mets_xml(researcher, publications) -> bytes:
        """Render the simplified METS document stored as ``mets.xml``.

        Args:
            researcher: accepted for signature parity with the other
                generators; not read by this method.
            publications: iterable of publication objects.

        Returns:
            UTF-8 encoded XML bytes, including the XML declaration.
        """
        from xml.etree.ElementTree import Element, SubElement, tostring

        dc_ns = "{http://purl.org/dc/elements/1.1/}"
        # bibliographicCitation is a DCMI Terms property; the /elements/1.1/
        # namespace only defines the fifteen classic elements.
        dcterms_ns = "{http://purl.org/dc/terms/}"

        mets = Element("mets", xmlns="http://www.loc.gov/METS/")
        header = SubElement(mets, "metsHdr")
        agent = SubElement(header, "agent", ROLE="CREATOR", TYPE="OTHER")
        SubElement(agent, "name").text = "ORCID Exporter System"

        # NOTE(review): every publication is written into this single dmdSec
        # ("dmd1"), so fields of different works are not grouped per work.
        dmd_sec = SubElement(mets, "dmdSec", ID="dmd1")
        md_wrap = SubElement(dmd_sec, "mdWrap", MDTYPE="DC")
        xml_data = SubElement(md_wrap, "xmlData")

        def add(qualified_tag, text):
            SubElement(xml_data, qualified_tag).text = text

        for pub in publications:
            # The title is always emitted, even when empty; every other
            # element is skipped when its source value is falsy.
            add(dc_ns + "title", pub.title)
            if pub.subtitle:
                add(dc_ns + "description", pub.subtitle)
            if pub.doi:
                add(dc_ns + "identifier", f"doi:{pub.doi}")
            if pub.journal:
                add(dc_ns + "source", pub.journal)
            if pub.url:
                add(dc_ns + "relation", pub.url)
            if pub.short_description:
                add(dc_ns + "description", pub.short_description)
            if pub.citation_value:
                add(dcterms_ns + "bibliographicCitation", pub.citation_value)
            if pub.language_code:
                add(dc_ns + "language", pub.language_code)
            if pub.country:
                add(dc_ns + "coverage", pub.country)
            if pub.pub_year:
                date_str = str(pub.pub_year)
                if pub.pub_month:
                    # int() lets numeric strings (e.g. "05") format as well
                    # as integers.
                    date_str += f"-{int(pub.pub_month):02d}"
                    # The day is appended only after a month, so the value
                    # is always YYYY, YYYY-MM or YYYY-MM-DD.
                    if pub.pub_day:
                        date_str += f"-{int(pub.pub_day):02d}"
                add(dc_ns + "date", date_str)
            if pub.type:
                add(dc_ns + "type", pub.type)

        return tostring(mets, encoding="utf-8", xml_declaration=True)

    @staticmethod
    def generate_zip(researcher, publications) -> bytes:
        """Build the export archive in memory and return its bytes.

        Members, in write order: ``sword.xml``, ``manifest.txt``,
        ``metadata.json``, ``mets.xml``, ``formats/dspace-import.csv``, one
        ``formats/<name>`` file per profile (generic, dublin_core, dspace,
        eprints; names come from ``export_filename_for_profile``) and one
        ``dspace-saf/item_NNNNN/dublin_core.xml`` per publication, numbered
        from 1.

        Args:
            researcher: researcher object handed to every generator.
            publications: iterable of publication objects; it is traversed
                several times, so it must be re-iterable.

        Returns:
            The complete DEFLATE-compressed ZIP file as bytes.
        """
        generic_xml = SWORDGenerator.generate_feed_xml(researcher, publications)
        manifest = ZIPGenerator.generate_manifest(researcher, publications)
        metadata_json = ZIPGenerator.generate_metadata_json(researcher, publications)
        mets_xml = ZIPGenerator.generate_mets_xml(researcher, publications)

        # The generic feed is stored twice: as sword.xml at the root and
        # under formats/ next to the repository-specific variants.
        format_files = {
            f"formats/{export_filename_for_profile('generic')}": generic_xml,
            f"formats/{export_filename_for_profile('dublin_core')}": (
                generate_dublin_core_records_xml(researcher, publications)
            ),
            f"formats/{export_filename_for_profile('dspace')}": (
                generate_dspace_sword_feed_xml(researcher, publications)
            ),
            f"formats/{export_filename_for_profile('eprints')}": (
                generate_eprints_import_xml(researcher, publications)
            ),
        }

        buffer = io.BytesIO()
        with zipfile.ZipFile(buffer, "w", zipfile.ZIP_DEFLATED) as archive:
            archive.writestr("sword.xml", generic_xml)
            archive.writestr("manifest.txt", manifest)
            archive.writestr("metadata.json", metadata_json)
            archive.writestr("mets.xml", mets_xml)
            archive.writestr(
                "formats/dspace-import.csv",
                generate_dspace_import_csv(researcher, publications),
            )
            for path, content in format_files.items():
                archive.writestr(path, content)
            for index, pub in enumerate(publications, start=1):
                archive.writestr(
                    f"dspace-saf/item_{index:05d}/dublin_core.xml",
                    generate_dspace_item_dublin_core(pub),
                )

        # Read only after the ZipFile is closed, so the central directory
        # has been written into the buffer.
        return buffer.getvalue()