aa2e7280dc
Backend: generadores por repositorio, ZIP multi-formato y query profile en /export/sword. Frontend: selector Destino que envía profile al descargar SWORD XML.
366 lines
13 KiB
Python
366 lines
13 KiB
Python
"""
|
|
Repository-oriented exporters: Dublin Core, DSpace and EPrints.

Supported profiles (`profile` query parameter on /export/sword/...):
- generic → ORCID Atom feed (backward compatible)
- dublin_core → XML with one DC record per publication
- dspace → Atom feed with DSpace / extended Dublin Core metadata
- eprints → EPrints import XML (EP3)

The ZIP bundles every profile under `formats/` plus DSpace SAF under `dspace-saf/`.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import csv
|
|
import io
|
|
from datetime import datetime
|
|
from typing import Iterable, List
|
|
from xml.etree.ElementTree import Element, SubElement, tostring
|
|
|
|
from app.db.models import Publication, Researcher
|
|
|
|
# XML namespace URIs referenced by the exporters in this module.
ATOM_NS = "http://www.w3.org/2005/Atom"
DC_NS = "http://purl.org/dc/elements/1.1/"
DCTERMS_NS = "http://purl.org/dc/terms/"
SWORD_NS = "http://purl.org/net/sword/"
EPRINTS_NS = "http://eprints.org/ep3/data"

# Profile names accepted by normalize_profile().
EXPORT_PROFILES = ("generic", "dublin_core", "dspace", "eprints")

# One row per known publication type: (source type, DSpace label, EPrints type).
# Both lookup maps below are derived from this table so they always cover the
# same keys in the same order.
_TYPE_TABLE = (
    ("journal-article", "Article", "article"),
    ("book-chapter", "Book chapter", "book_section"),
    ("book", "Book", "book"),
    ("conference-paper", "Conference paper", "conference_item"),
    ("conference-abstract", "Conference paper", "conference_item"),
    ("dissertation-thesis", "Thesis", "thesis"),
    ("report", "Report", "report"),
    ("preprint", "Preprint", "preprint"),
    ("other", "Other", "other"),
)

# Publication type -> human-readable DSpace item type.
_DSPACE_TYPE_MAP = {source: dspace for source, dspace, _ in _TYPE_TABLE}

# Publication type -> EPrints ``type`` value.
_EPRINTS_TYPE_MAP = {source: eprints for source, _, eprints in _TYPE_TABLE}
|
|
|
|
|
|
def normalize_profile(profile: str | None) -> str:
    """Return the canonical (trimmed, lower-cased) export profile name.

    A missing or empty ``profile`` falls back to ``"generic"``.

    Raises:
        ValueError: if the cleaned value is not one of ``EXPORT_PROFILES``.
    """
    candidate = profile if profile else "generic"
    candidate = candidate.strip().lower()
    if candidate in EXPORT_PROFILES:
        return candidate

    allowed = ", ".join(EXPORT_PROFILES)
    # The message reports the caller's raw input, not the cleaned candidate.
    raise ValueError(f"Invalid export profile {profile!r}. Use one of: {allowed}")
|
|
|
|
|
|
def publication_date_iso(pub: Publication) -> str | None:
    """Return the publication date as a partial ISO-8601 string.

    The result is ``YYYY``, ``YYYY-MM`` or ``YYYY-MM-DD`` depending on which
    of ``pub.pub_year``, ``pub.pub_month`` and ``pub.pub_day`` are set.

    Returns:
        The date string, or ``None`` when the publication has no year.
    """
    if not pub.pub_year:
        return None

    date_str = str(pub.pub_year)
    if pub.pub_month:
        # int() keeps the zero-padding working if the value arrives as a
        # numeric string such as "05" (assumption: the model may store either).
        date_str += f"-{int(pub.pub_month):02d}"
        # A day is only appended once the month is known: "2020" + "-07"
        # would otherwise be read as July 2020 instead of "day 7".
        if pub.pub_day:
            date_str += f"-{int(pub.pub_day):02d}"
    return date_str
|
|
|
|
|
|
def contributor_names(pub: Publication) -> List[str]:
    """Collect the non-empty ``"name"`` values from ``pub.contributors``.

    Entries that are falsy (e.g. ``None``) or that carry no truthy ``"name"``
    are skipped; every kept name is converted with ``str()``.
    """
    entries = pub.contributors or []
    raw_names = ((entry or {}).get("name") for entry in entries)
    return [str(raw) for raw in raw_names if raw]
|
|
|
|
|
|
def split_person_name(full_name: str) -> tuple[str, str]:
    """Split a personal name into ``(family, given)``.

    Two input shapes are recognised:

    * ``"Family, Given"`` - the text before the first comma is the family
      name and the rest is the given name.
    * ``"Given [Middle ...] Family"`` - the last whitespace-separated token
      is the family name and everything before it is the given name.

    A single-token (or empty) name is returned as the family name with an
    empty given name. Surrounding and repeated whitespace is normalised.
    """
    text = (full_name or "").strip()

    if "," in text:
        before, _, after = text.partition(",")
        family = " ".join(before.split())
        given = " ".join(after.split())
        if family and given:
            return family, given
        # Degenerate comma forms ("Family," or ", Given"): drop the comma
        # and fall through to the whitespace-based split below.
        text = family or given

    parts = text.split()
    if len(parts) <= 1:
        return text, ""
    return parts[-1], " ".join(parts[:-1])
|
|
|
|
|
|
def dspace_type(pub: Publication) -> str:
    """Return the DSpace item-type label for ``pub.type``.

    Known types are looked up in ``_DSPACE_TYPE_MAP``; unknown ones are
    title-cased with hyphens turned into spaces. A missing type yields
    ``"Other"``.
    """
    work_type = pub.type
    if work_type:
        fallback = work_type.replace("-", " ").title()
        return _DSPACE_TYPE_MAP.get(work_type, fallback)
    return "Other"
|
|
|
|
|
|
def eprints_type(pub: Publication) -> str:
    """Return the EPrints ``type`` value for ``pub.type``.

    Missing or unmapped types both resolve to ``"other"``.
    """
    work_type = pub.type
    return _EPRINTS_TYPE_MAP.get(work_type, "other") if work_type else "other"
|
|
|
|
|
|
def _safe_text(value) -> str | None:
    """Return ``value`` as stripped text, or ``None`` when it carries nothing.

    ``None``, blank strings and the literal text ``"none"`` (any casing)
    are all treated as "no value".
    """
    if value is None:
        return None

    cleaned = str(value).strip()
    if not cleaned or cleaned.lower() == "none":
        return None
    return cleaned
|
|
|
|
|
|
def _append_dc(parent: Element, tag: str, text: str | None) -> None:
    """Append a ``tag`` child holding ``text`` to ``parent``; no-op for falsy text."""
    if not text:
        return
    child = SubElement(parent, tag)
    child.text = text
|
|
|
|
|
|
def generate_dublin_core_records_xml(
    researcher: Researcher,
    publications: Iterable[Publication],
) -> bytes:
    """Serialize publications as a ``<dublinCoreRecords>`` XML document.

    The root element carries the researcher's ORCID iD and name as
    attributes; each publication becomes a ``<record>`` with one child per
    available metadata field. Empty fields are omitted.

    Args:
        researcher: Owner of the publications.
        publications: Publications to export, in output order.

    Returns:
        UTF-8 encoded XML including the XML declaration.
    """
    root = Element("dublinCoreRecords", {
        # ElementTree cannot serialize a None attribute value, and the ORCID
        # is treated as optional further down, so fall back to "".
        "researcherOrcid": researcher.orcid_id or "",
        "researcherName": researcher.name or "",
    })

    for pub in publications:
        record = SubElement(root, "record", {
            "id": str(pub.id),
            "putCode": str(pub.put_code),
        })
        _append_dc(record, f"{{{DC_NS}}}title", pub.title)
        _append_dc(record, f"{{{DC_NS}}}type", pub.type)
        _append_dc(record, f"{{{DC_NS}}}source", pub.journal)
        _append_dc(record, f"{{{DC_NS}}}language", pub.language_code)
        _append_dc(record, f"{{{DC_NS}}}coverage", pub.country)
        _append_dc(record, f"{{{DC_NS}}}description", pub.short_description or pub.subtitle)
        # bibliographicCitation is a DCMI Terms property; it does not exist
        # in the 15-element /elements/1.1/ namespace.
        _append_dc(record, f"{{{DCTERMS_NS}}}bibliographicCitation", pub.citation_value)
        if pub.doi:
            _append_dc(record, f"{{{DC_NS}}}identifier", f"doi:{pub.doi}")
        if pub.url:
            _append_dc(record, f"{{{DC_NS}}}relation", pub.url)
        date_iso = publication_date_iso(pub)
        if date_iso:
            _append_dc(record, f"{{{DC_NS}}}date", date_iso)
        for name in contributor_names(pub):
            _append_dc(record, f"{{{DC_NS}}}creator", name)
        if researcher.orcid_id:
            # provenance is likewise a DCMI Terms property.
            _append_dc(record, f"{{{DCTERMS_NS}}}provenance", f"orcid:{researcher.orcid_id}")

    return tostring(root, encoding="utf-8", xml_declaration=True)
|
|
|
|
|
|
def generate_dspace_item_dublin_core(pub: Publication) -> bytes:
    """Build the ``<dublin_core>`` metadata document for one publication.

    Every populated field becomes a ``<dcvalue element=... qualifier=...>``
    child; empty values are skipped.

    Returns:
        UTF-8 encoded XML including the XML declaration.
    """
    # (element, qualifier, value) triples in output order.
    fields = [
        ("title", "none", pub.title),
        ("type", "none", dspace_type(pub)),
        ("source", "none", pub.journal),
        ("language", "iso", pub.language_code),
        ("coverage", "spatial", pub.country),
        ("description", "abstract", pub.short_description or pub.subtitle),
        ("description", "none", pub.citation_value),
        ("date", "issued", publication_date_iso(pub)),
    ]

    # The DOI, when present, supplies both the doi and uri identifiers;
    # otherwise the plain URL stands in as the uri identifier.
    if pub.doi:
        fields.append(("identifier", "doi", pub.doi))
        fields.append(("identifier", "uri", f"https://doi.org/{pub.doi}"))
    elif pub.url:
        fields.append(("identifier", "uri", pub.url))

    fields.append(("relation", "uri", pub.url))
    fields.extend(
        ("contributor", "author", person) for person in contributor_names(pub)
    )

    root = Element("dublin_core")
    for element, qualifier, value in fields:
        if not value:
            continue
        node = SubElement(root, "dcvalue", element=element, qualifier=qualifier)
        node.text = value

    return tostring(root, encoding="utf-8", xml_declaration=True)
|
|
|
|
|
|
def generate_dspace_import_csv(
    researcher: Researcher,
    publications: Iterable[Publication],
) -> str:
    """Render publications as a fully quoted CSV with DSpace-style columns.

    One header row is followed by one row per publication, numbered from 1.
    The ``collection`` and ``dc.publisher`` columns are always left empty.

    Args:
        researcher: Owner of the publications; the name is used as the
            author when a publication lists no contributors.
        publications: Publications to export, in output order.

    Returns:
        The CSV text, using ``"\\n"`` line endings.
    """
    output = io.StringIO()
    writer = csv.writer(output, quoting=csv.QUOTE_ALL, lineterminator="\n")
    writer.writerow([
        "row_id",
        "collection",
        "dc.title",
        "dc.contributor.author",
        "dc.date.issued",
        "dc.description",
        "dc.identifier.doi",
        "dc.identifier.uri",
        "dc.language.iso",
        "dc.publisher",
        "dc.relation.ispartof",
        "dc.source",
        "dc.type",
        "dc.provenance",
    ])

    # Leave the provenance empty instead of writing the literal "orcid:None"
    # when the researcher has no ORCID iD.
    provenance = f"orcid:{researcher.orcid_id}" if researcher.orcid_id else ""

    for index, pub in enumerate(publications, start=1):
        # NOTE(review): authors are joined with "; "; confirm the importer
        # that consumes this file splits multi-valued cells on that separator.
        authors = "; ".join(contributor_names(pub)) or (researcher.name or "")
        writer.writerow([
            index,
            "",  # collection
            pub.title or "",
            authors,
            publication_date_iso(pub) or "",
            pub.short_description or pub.citation_value or "",
            pub.doi or "",
            pub.url or (f"https://doi.org/{pub.doi}" if pub.doi else ""),
            pub.language_code or "",
            "",  # dc.publisher
            pub.journal or "",  # dc.relation.ispartof
            pub.journal or "",  # dc.source
            dspace_type(pub),
            provenance,
        ])

    return output.getvalue()
|
|
|
|
|
|
def generate_dspace_sword_feed_xml(
    researcher: Researcher,
    publications: Iterable[Publication],
) -> bytes:
    """Build an Atom feed aimed at DSpace ingest (DC/dcterms metadata per entry).

    This does not replace a SWORD 2.0 deposit with bitstreams, but it aligns
    the Dublin Core fields.

    Args:
        researcher: Owner of the publications; becomes the feed author.
        publications: Publications to export, one ``<entry>`` each.

    Returns:
        UTF-8 encoded XML including the XML declaration.
    """
    # Function-scope import: the module only imports ``datetime`` itself.
    from datetime import timezone

    # Namespaces are declared once, literally, on the root element, and the
    # extension elements below use the matching literal prefixes. Mixing these
    # declarations with Clark-notation tags ("{uri}name") makes ElementTree
    # add its own xmlns declarations; DC has a built-in "dc" prefix there, so
    # the root ended up with two xmlns:dc attributes (malformed XML).
    feed = Element("feed", {
        "xmlns": ATOM_NS,
        "xmlns:dc": DC_NS,
        "xmlns:dcterms": DCTERMS_NS,
        "xmlns:sword": SWORD_NS,
    })

    # One timezone-aware timestamp for the whole feed, rendered in the same
    # naive-ISO-plus-"Z" form as before.
    updated = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"

    SubElement(feed, "title").text = f"DSpace export for {researcher.orcid_id}"
    SubElement(feed, "id").text = f"urn:uuid:{researcher.id}"
    SubElement(feed, "updated").text = updated
    author = SubElement(feed, "author")
    SubElement(author, "name").text = researcher.name or researcher.orcid_id

    for pub in publications:
        item_type = dspace_type(pub)

        entry = SubElement(feed, "entry")
        SubElement(entry, "title").text = pub.title or "Untitled"
        SubElement(entry, "id").text = f"urn:uuid:{pub.id}"
        SubElement(entry, "updated").text = updated
        SubElement(entry, "sword:deposit").text = "true"
        SubElement(entry, "sword:noOp").text = "false"

        category = SubElement(entry, "category")
        category.set("term", item_type)
        category.set("scheme", "http://dspace.org/itemtypes")

        # Optional metadata, in output order; empty values are skipped.
        optional_fields = (
            ("dc:title", pub.title),
            ("dc:source", pub.journal),
            ("dc:identifier", f"doi:{pub.doi}" if pub.doi else None),
            ("dcterms:relation", pub.url),
            ("dcterms:abstract", pub.short_description),
            ("dcterms:bibliographicCitation", pub.citation_value),
            ("dc:language", pub.language_code),
            ("dcterms:issued", publication_date_iso(pub)),
        )
        for tag, value in optional_fields:
            if value:
                SubElement(entry, tag).text = value

        SubElement(entry, "dc:type").text = item_type

        for name in contributor_names(pub):
            author_el = SubElement(entry, "author")
            SubElement(author_el, "name").text = name

    return tostring(feed, encoding="utf-8", xml_declaration=True)
|
|
|
|
|
|
def generate_eprints_import_xml(
    researcher: Researcher,
    publications: Iterable[Publication],
) -> bytes:
    """Build an EPrints (EP3) import document, one ``<eprint>`` per publication.

    Each eprint is numbered from 1 in iteration order and stamped with
    today's UTC date. Creators come from the publication's contributors,
    falling back to the researcher's name when none are listed.

    Args:
        researcher: Owner of the publications.
        publications: Publications to export, in output order.

    Returns:
        UTF-8 encoded XML including the XML declaration.
    """
    # Function-scope import: the module only imports ``datetime`` itself.
    from datetime import timezone

    root = Element("eprints", xmlns=EPRINTS_NS)
    today = datetime.now(timezone.utc).strftime("%Y-%m-%d")

    for index, pub in enumerate(publications, start=1):
        eprint = SubElement(root, "eprint")
        SubElement(eprint, "eprintid").text = str(index)
        SubElement(eprint, "rev_number").text = "1"
        SubElement(eprint, "documents")
        SubElement(eprint, "eprint_status").text = "archive"
        SubElement(eprint, "userid").text = "1"
        # NOTE(review): the path keeps its original shape; the last segment
        # grows past two digits once index exceeds 99 - confirm that is fine.
        SubElement(eprint, "dir").text = f"disk00000/00/00/{index:02d}"
        SubElement(eprint, "datestamp").text = today
        SubElement(eprint, "lastmod").text = today
        SubElement(eprint, "status_changed").text = today
        SubElement(eprint, "type").text = eprints_type(pub)

        titles = SubElement(eprint, "titles")
        title_item = SubElement(titles, "item")
        SubElement(title_item, "lang").text = pub.language_code or "en"
        SubElement(title_item, "title").text = pub.title or "Untitled"

        creators = SubElement(eprint, "creators")
        names = contributor_names(pub) or ([researcher.name] if researcher.name else [])
        for name in names:
            family, given = split_person_name(name)
            item = SubElement(creators, "item")
            name_el = SubElement(item, "name")
            SubElement(name_el, "family").text = family
            if given:
                SubElement(name_el, "given").text = given

        if pub.pub_year:
            SubElement(eprint, "date").text = str(pub.pub_year)
        if pub.journal:
            SubElement(eprint, "publication").text = pub.journal
        if pub.doi:
            SubElement(eprint, "doi").text = pub.doi
        if pub.url:
            SubElement(eprint, "official_url").text = pub.url
        if pub.short_description:
            SubElement(eprint, "abstract").text = pub.short_description

        # Emit a single <note>: the citation and the ORCID source line used
        # to be written as two sibling <note> elements on the same eprint.
        note_lines = []
        if pub.citation_value:
            note_lines.append(pub.citation_value)
        if researcher.orcid_id:
            note_lines.append(f"Source ORCID: {researcher.orcid_id}")
        if note_lines:
            SubElement(eprint, "note").text = "\n".join(note_lines)

    return tostring(root, encoding="utf-8", xml_declaration=True)
|
|
|
|
|
|
def generate_repository_xml(
    researcher: Researcher,
    publications: List[Publication],
    profile: str,
) -> bytes:
    """Generate the export XML for ``profile``.

    The profile is validated through ``normalize_profile`` (which raises
    ``ValueError`` for unknown values). ``dublin_core``, ``dspace`` and
    ``eprints`` are handled by this module's generators; anything else
    (i.e. ``generic``) is delegated to ``SWORDGenerator.generate_feed_xml``.
    """
    resolved = normalize_profile(profile)

    builders = {
        "dublin_core": generate_dublin_core_records_xml,
        "dspace": generate_dspace_sword_feed_xml,
        "eprints": generate_eprints_import_xml,
    }
    builder = builders.get(resolved)
    if builder is not None:
        return builder(researcher, publications)

    # Imported only when the generic profile is actually requested.
    from app.services.sword_generator import SWORDGenerator

    return SWORDGenerator.generate_feed_xml(researcher, publications)
|
|
|
|
|
|
def export_filename_for_profile(profile: str) -> str:
    """Return the file name used for the XML export of ``profile``.

    The profile is validated through ``normalize_profile`` first, so an
    unknown value raises ``ValueError``.
    """
    filenames = {
        "generic": "generic-atom.xml",
        "dublin_core": "dublin_core.xml",
        "dspace": "dspace-atom.xml",
        "eprints": "eprints-import.xml",
    }
    resolved = normalize_profile(profile)
    return filenames[resolved]
|