From 1d26085f3b2d4f55e6363b568feb456c04a9184d Mon Sep 17 00:00:00 2001 From: Roberta Takenaka Date: Thu, 12 Mar 2026 11:32:05 -0300 Subject: [PATCH 01/27] =?UTF-8?q?Adiciona=20migra=C3=A7=C3=B5es=20de=20art?= =?UTF-8?q?icle,=20core/home,=20instituition,=20organization,=20pid=5Fprov?= =?UTF-8?q?ider?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../0048_alter_articlesource_status.py | 32 +++++ .../0015_alter_formpage_thank_you_text.py | 22 ++++ ...alter_institution_url_alter_scimago_url.py | 23 ++++ ...alter_digitalpreservationagency_options.py | 20 +++ .../migrations/0012_alter_organization_url.py | 18 +++ .../0015_alter_xmlversion_file_xmlurl.py | 120 ++++++++++++++++++ 6 files changed, 235 insertions(+) create mode 100644 article/migrations/0048_alter_articlesource_status.py create mode 100644 core/home/migrations/0015_alter_formpage_thank_you_text.py create mode 100644 institution/migrations/0008_alter_institution_url_alter_scimago_url.py create mode 100644 journal/migrations/0059_alter_digitalpreservationagency_options.py create mode 100644 organization/migrations/0012_alter_organization_url.py create mode 100644 pid_provider/migrations/0015_alter_xmlversion_file_xmlurl.py diff --git a/article/migrations/0048_alter_articlesource_status.py b/article/migrations/0048_alter_articlesource_status.py new file mode 100644 index 000000000..e91cb3cd1 --- /dev/null +++ b/article/migrations/0048_alter_articlesource_status.py @@ -0,0 +1,32 @@ +# Generated by Django 5.2.7 on 2026-03-01 15:15 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("article", "0047_articleaffiliation_normalized_and_more"), + ] + + operations = [ + migrations.AlterField( + model_name="articlesource", + name="status", + field=models.CharField( + choices=[ + ("pending", "Pending"), + ("processing", "Processing"), + ("completed", "Completed"), + ("error", "Error"), + ("reprocess", 
"Reprocess"), + ("url_error", "URL Error"), + ("xml_error", "XML Error"), + ], + default="pending", + help_text="Processing status of the article source", + max_length=20, + verbose_name="Status", + ), + ), + ] diff --git a/core/home/migrations/0015_alter_formpage_thank_you_text.py b/core/home/migrations/0015_alter_formpage_thank_you_text.py new file mode 100644 index 000000000..4d4cca974 --- /dev/null +++ b/core/home/migrations/0015_alter_formpage_thank_you_text.py @@ -0,0 +1,22 @@ +# Generated by Django 5.2.7 on 2026-03-12 14:11 + +import wagtail.fields +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ("home", "0014_homepagesponsor"), + ] + + operations = [ + migrations.AlterField( + model_name="formpage", + name="thank_you_text", + field=wagtail.fields.RichTextField( + blank=True, + help_text="Adicione a mensagem que será exibida após o envio do formulário.", + ), + ), + ] diff --git a/institution/migrations/0008_alter_institution_url_alter_scimago_url.py b/institution/migrations/0008_alter_institution_url_alter_scimago_url.py new file mode 100644 index 000000000..d016b3fd9 --- /dev/null +++ b/institution/migrations/0008_alter_institution_url_alter_scimago_url.py @@ -0,0 +1,23 @@ +# Generated by Django 5.2.7 on 2026-03-12 14:11 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("institution", "0007_alter_copyrightholder_created_and_more"), + ] + + operations = [ + migrations.AlterField( + model_name="institution", + name="url", + field=models.URLField(blank=True, null=True, verbose_name="URL"), + ), + migrations.AlterField( + model_name="scimago", + name="url", + field=models.URLField(blank=True, null=True, verbose_name="URL"), + ), + ] diff --git a/journal/migrations/0059_alter_digitalpreservationagency_options.py b/journal/migrations/0059_alter_digitalpreservationagency_options.py new file mode 100644 index 000000000..215fee0fc --- /dev/null +++ 
b/journal/migrations/0059_alter_digitalpreservationagency_options.py @@ -0,0 +1,20 @@ +# Generated by Django 5.2.7 on 2026-03-12 14:11 + +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ("journal", "0058_remove_journal_doi_prefix_journal_crossref_configuration"), + ] + + operations = [ + migrations.AlterModelOptions( + name="digitalpreservationagency", + options={ + "verbose_name": "Digital Preservation Agency", + "verbose_name_plural": "Digital Preservation Agencies", + }, + ), + ] diff --git a/organization/migrations/0012_alter_organization_url.py b/organization/migrations/0012_alter_organization_url.py new file mode 100644 index 000000000..fbd3e1fed --- /dev/null +++ b/organization/migrations/0012_alter_organization_url.py @@ -0,0 +1,18 @@ +# Generated by Django 5.2.7 on 2026-03-12 14:11 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("organization", "0011_normaffiliation_and_more"), + ] + + operations = [ + migrations.AlterField( + model_name="organization", + name="url", + field=models.URLField(blank=True, null=True, verbose_name="URL"), + ), + ] diff --git a/pid_provider/migrations/0015_alter_xmlversion_file_xmlurl.py b/pid_provider/migrations/0015_alter_xmlversion_file_xmlurl.py new file mode 100644 index 000000000..3b6fb14c0 --- /dev/null +++ b/pid_provider/migrations/0015_alter_xmlversion_file_xmlurl.py @@ -0,0 +1,120 @@ +# Generated by Django 5.2.7 on 2026-03-01 15:15 + +import django.db.models.deletion +import pid_provider.models +from django.conf import settings +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ( + "pid_provider", + "0014_remove_xmlversion_pid_provide_pid_pro_91cb7b_idx_and_more", + ), + migrations.swappable_dependency(settings.AUTH_USER_MODEL), + ] + + operations = [ + migrations.AlterField( + model_name="xmlversion", + name="file", + field=models.FileField( + 
blank=True, + max_length=300, + null=True, + upload_to=pid_provider.models.xml_directory_path, + ), + ), + migrations.CreateModel( + name="XMLURL", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ( + "created", + models.DateTimeField( + auto_now_add=True, verbose_name="Creation date" + ), + ), + ( + "updated", + models.DateTimeField( + auto_now=True, verbose_name="Last update date" + ), + ), + ("url", models.URLField(max_length=500, verbose_name="URL")), + ( + "status", + models.CharField( + blank=True, max_length=50, null=True, verbose_name="Status" + ), + ), + ( + "pid", + models.CharField( + blank=True, max_length=23, null=True, verbose_name="Article PID" + ), + ), + ( + "zipfile", + models.FileField( + blank=True, + max_length=300, + null=True, + upload_to=pid_provider.models.xml_url_zipfile_path, + verbose_name="ZIP File", + ), + ), + ( + "exceptions", + models.CharField( + blank=True, max_length=255, null=True, verbose_name="Exceptions" + ), + ), + ( + "creator", + models.ForeignKey( + editable=False, + null=True, + on_delete=django.db.models.deletion.SET_NULL, + related_name="%(class)s_creator", + to=settings.AUTH_USER_MODEL, + verbose_name="Creator", + ), + ), + ( + "updated_by", + models.ForeignKey( + blank=True, + editable=False, + null=True, + on_delete=django.db.models.deletion.SET_NULL, + related_name="%(class)s_last_mod_user", + to=settings.AUTH_USER_MODEL, + verbose_name="Updater", + ), + ), + ], + options={ + "verbose_name": "XML URL", + "verbose_name_plural": "XML URLs", + "ordering": ["-updated", "-created"], + "indexes": [ + models.Index(fields=["url"], name="pid_provide_url_d6fff2_idx"), + models.Index( + fields=["status"], name="pid_provide_status_39bead_idx" + ), + models.Index(fields=["pid"], name="pid_provide_pid_588ddc_idx"), + ], + }, + ), + ] From 49013bd6c8d29f31f6c14f7e07dc18f416222cd5 Mon Sep 17 00:00:00 2001 From: Roberta Takenaka Date: Thu, 12 
Mar 2026 11:32:19 -0300 Subject: [PATCH 02/27] =?UTF-8?q?journal:=20corrige=20duplica=C3=A7=C3=A3o?= =?UTF-8?q?=20da=20constante=20UPDATE=5FPOLICY=5FTYPE=20em=20choices.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove a primeira definição duplicada de UPDATE_POLICY_TYPE, mantendo apenas a versão final com todos os tipos de política de atualização: correction, retraction, partial-retraction, withdrawal, expression-of-concern e other. --- journal/choices.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/journal/choices.py b/journal/choices.py index 9c17c14aa..47a94140b 100644 --- a/journal/choices.py +++ b/journal/choices.py @@ -173,3 +173,12 @@ ("expression-of-concern", _("Expression of Concern")), ("other", _("Other")), ] + +UPDATE_POLICY_TYPE = [ + ("correction", _("Correction")), + ("retraction", _("Retraction")), + ("partial-retraction", _("Partial Retraction")), + ("withdrawal", _("Withdrawal")), + ("expression-of-concern", _("Expression of Concern")), + ("other", _("Other")), +] From 69631e41529e0e57669644e87658b69c0be63f32 Mon Sep 17 00:00:00 2001 From: Roberta Takenaka Date: Thu, 12 Mar 2026 11:32:19 -0300 Subject: [PATCH 03/27] =?UTF-8?q?i18n:=20atualiza=20tradu=C3=A7=C3=A3o=20e?= =?UTF-8?q?m=20espanhol=20de=20'Publisher'=20para=20'Entidad=20Editora'?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- locale/es/LC_MESSAGES/django.po | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/locale/es/LC_MESSAGES/django.po b/locale/es/LC_MESSAGES/django.po index 8d90f9948..eeade342a 100644 --- a/locale/es/LC_MESSAGES/django.po +++ b/locale/es/LC_MESSAGES/django.po @@ -627,7 +627,7 @@ msgid "Localization" msgstr "Localización" msgid "Publisher" -msgstr "Editor" +msgstr "Entidad Editora" msgid "SciELO Book" msgstr "Libro SciELO" From ff49ab06676750e58bf6f769548a98f0fe8172a9 Mon Sep 17 00:00:00 2001 From: Roberta Takenaka Date: Thu, 12 Mar 2026 
11:32:19 -0300 Subject: [PATCH 04/27] collection: adiciona cached_property base_url ao modelo Collection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduz a propriedade base_url que retorna o domínio da coleção pronto para compor URLs, adicionando o prefixo 'https://' quando o campo domain não contém protocolo. Usado para substituir o uso direto de collection.domain com concatenação manual de 'http://' nos índices de busca e demais consumidores. --- collection/models.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/collection/models.py b/collection/models.py index 038a2e9ea..73d8e5f94 100755 --- a/collection/models.py +++ b/collection/models.py @@ -1,4 +1,5 @@ import logging +from functools import cached_property from django.db import models from django.utils.translation import gettext_lazy as _ @@ -186,6 +187,13 @@ class Meta: ), ] + @cached_property + def base_url(self): + """Retorna o domain pronto para compor URLs, adicionando protocolo se ausente.""" + if self.domain and not self.domain.startswith(("http://", "https://")): + return f"https://{self.domain}" + return self.domain + @property def data(self): d = { From 33989222132de3bd99e836ec038d92b8a7b868bf Mon Sep 17 00:00:00 2001 From: Roberta Takenaka Date: Thu, 12 Mar 2026 11:32:35 -0300 Subject: [PATCH 05/27] core: move CommonControlFieldViewSet de viewsets.py para views.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Consolida CommonControlFieldViewSet no módulo views.py ao lado de CommonControlFieldCreateView, eliminando o arquivo viewsets.py que passa a ser removido. O comportamento do ViewSet base (delegação para form.save_all quando disponível) permanece idêntico. Atualiza o import em pid_provider/wagtail_hooks.py para apontar para core.views. 
--- core/views.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/core/views.py b/core/views.py index 412b40789..73778af7e 100644 --- a/core/views.py +++ b/core/views.py @@ -1,8 +1,18 @@ from django.http import HttpResponseRedirect from wagtail_modeladmin.views import CreateView +from wagtail.snippets.views.snippets import SnippetViewSet class CommonControlFieldCreateView(CreateView): def form_valid(self, form): self.object = form.save_all(self.request.user) return HttpResponseRedirect(self.get_success_url()) + + +class CommonControlFieldViewSet(SnippetViewSet): + """ViewSet base com save_instance compartilhado""" + + def save_instance(self, instance, form, is_new): + if hasattr(form, 'save_all'): + return form.save_all(self.request.user) + return super().save_instance(instance, form, is_new) \ No newline at end of file From d556d84a5c3c2bea6c490ba87a9175d3f8006be0 Mon Sep 17 00:00:00 2001 From: Roberta Takenaka Date: Thu, 12 Mar 2026 11:34:21 -0300 Subject: [PATCH 06/27] =?UTF-8?q?pid=5Fprovider:=20adiciona=20XMLURL,=20re?= =?UTF-8?q?fatora=20deduplica=C3=A7=C3=A3o=20e=20melhora=20XMLVersion/PidP?= =?UTF-8?q?roviderXML?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit XMLVersion: - max_length=300 em file para caminhos longos. - get_or_create: trata AttributeError/TypeError/ValueError ao verificar existência do arquivo em disco; usa pid_provider_xml.v3 como nome de arquivo em vez de sps_pkg_name. PidProviderXML: - Novo classmethod get_by_pid_v3(pid_v3, partial_pid_v2, pid_v2): busca flexível por v3/v2/v2__contains; em caso de múltiplos resultados retorna o mais recente. - fix_pid_v2: passa a usar get_by_pid_v3 no lugar de objects.get(v3=). - find_duplicated_pkg_names: retorna list(set(...)) em vez de values_list flat, evitando duplicatas no resultado. - Remove find_duplicated_v2 (não mais utilizado). 
- Novo classmethod mark_items_as_duplicated(issns): marca em bulk todos os PidProviderXML com pkg_name duplicado como PPXML_STATUS_DUPLICATED. - deduplicate_items: simplificado — delega apenas para fix_duplicated_pkg_name, removendo lógica de v2. - Renomeia fix_duplicated_items → fix_duplicated_pkg_name e remove o filtro por v2; adiciona tratamento de exceção com UnexpectedEvent. - Novo método de instância fix_pkg_name(pkg_name): atualiza pkg_name quando difere do valor atual. XMLURL (novo modelo): - Rastreia URLs processadas pelo PidProvider com status, pid, zipfile comprimido e exceptions (máx. 255 chars). - Métodos: get, create, create_or_update, save_file (gera ZIP em memória com zipfile.ZipFile e salva via ContentFile). - xml_url_zipfile_path: função de upload path baseada em hash da URL. - Função auxiliar _truncate_traceback em base_pid_provider. Alterações na migração: pid_provider: migration 0015 — amplia max_length de XMLVersion.file e cria modelo XMLURL - XMLVersion.file: aumenta max_length de padrão para 300 para suportar caminhos de arquivo mais longos. - Cria o novo modelo XMLURL com os campos: url (URLField 500), status, pid, zipfile (FileField 300, upload via xml_url_zipfile_path), exceptions (CharField 255) e chaves estrangeiras de auditoria (creator / updated_by). Índices criados em url, status e pid. 
--- pid_provider/models.py | 284 +++++++++++++++++++++++++++++++---------- 1 file changed, 220 insertions(+), 64 deletions(-) diff --git a/pid_provider/models.py b/pid_provider/models.py index c32b41684..28384fcc5 100644 --- a/pid_provider/models.py +++ b/pid_provider/models.py @@ -1,8 +1,10 @@ +import io import json import logging import os import sys import traceback +import zipfile from datetime import datetime from functools import lru_cache, cached_property from zlib import crc32 @@ -108,7 +110,7 @@ class XMLVersion(CommonControlField): pid_provider_xml = models.ForeignKey( "PidProviderXML", null=True, blank=True, on_delete=models.SET_NULL ) - file = models.FileField(upload_to=xml_directory_path, null=True, blank=True) + file = models.FileField(upload_to=xml_directory_path, null=True, blank=True, max_length=300) finger_print = models.CharField(max_length=64, null=True, blank=True) class Meta: @@ -198,16 +200,17 @@ def get(cls, pid_provider_xml, finger_print): def get_or_create(cls, user, pid_provider_xml, xml_with_pre): try: latest = cls.get(pid_provider_xml, xml_with_pre.finger_print) - if not os.path.isfile(latest.file.path): - try: - filename = xml_with_pre.sps_pkg_name - except Exception as e: - filename = pid_provider_xml.v3 - latest.save_file( - f"{filename}.xml", - xml_with_pre.tostring(pretty_print=True), - ) - latest.save() + try: + file_exist = os.path.isfile(latest.file.path) + except (AttributeError, TypeError, ValueError) as e: + file_exist = False + if file_exist: + return latest + latest.save_file( + f"{pid_provider_xml.v3}.xml", + xml_with_pre.tostring(pretty_print=True), + ) + latest.save() return latest except cls.DoesNotExist: return cls.create( @@ -1265,12 +1268,26 @@ def is_registered( response.update({"error_msg": str(e), "error_type": str(type(e))}) return response return {} + + @classmethod + def get_by_pid_v3(cls, pid_v3, partial_pid_v2=None, pid_v2=None): + params = {} + if pid_v3: + params["v3"] = pid_v3 + if pid_v2: + params["v2"] = 
pid_v2 + if partial_pid_v2: + params["v2__contains"] = partial_pid_v2 + try: + return cls.objects.get(**params) + except cls.MultipleObjectsReturned as e: + return cls.objects.filter(**params).order_by("-updated").first() @classmethod @profile_classmethod def fix_pid_v2(cls, user, pid_v3, correct_pid_v2): try: - item = cls.objects.get(v3=pid_v3) + item = cls.get_by_pid_v3(pid_v3) except cls.DoesNotExist as e: raise cls.DoesNotExist(f"{e}: {pid_v3}") @@ -1315,7 +1332,7 @@ def mark_items_as_invalid(cls, issns): @profile_classmethod def find_duplicated_pkg_names(cls, issns): # Busca em ambos os campos de ISSN - return ( + duplicates = ( cls.objects.filter(Q(issn_print__in=issns) | Q(issn_electronic__in=issns)) .exclude(pkg_name__isnull=True) .exclude(pkg_name="") @@ -1328,31 +1345,25 @@ def find_duplicated_pkg_names(cls, issns): .values("pkg_name") .annotate(count=Count("id")) .filter(count__gt=1) - .values_list("pkg_name", flat=True) ) - + return list(set(item["pkg_name"] for item in duplicates)) + + @classmethod @profile_classmethod - def find_duplicated_v2(cls, issns): - # Busca em ambos os campos de ISSN - return ( - cls.objects.filter(Q(issn_print__in=issns) | Q(issn_electronic__in=issns)) - .exclude(v2__isnull=True) - .exclude(v2="") - .exclude( - proc_status__in=[ - choices.PPXML_STATUS_DUPLICATED, - choices.PPXML_STATUS_INVALID, - ] - ) - .values("v2") - .annotate(count=Count("id")) - .filter(count__gt=1) - .values_list("v2", flat=True) + def mark_items_as_duplicated(cls, issns): + ppx_duplicated_pkg_names = PidProviderXML.find_duplicated_pkg_names(issns) + if not ppx_duplicated_pkg_names: + return + cls.objects.filter(pkg_name__in=ppx_duplicated_pkg_names).exclude( + proc_status=choices.PPXML_STATUS_DUPLICATED + ).update( + proc_status=choices.PPXML_STATUS_DUPLICATED, ) + return ppx_duplicated_pkg_names @classmethod @profile_classmethod - def deduplicate_items(cls, user, issns, mark_as_duplicated=False, deduplicate=False): + def deduplicate_items(cls, user, 
issns): """ Corrige todos os artigos marcados como DATA_STATUS_DUPLICATED com base nos ISSNs fornecidos. @@ -1360,50 +1371,26 @@ def deduplicate_items(cls, user, issns, mark_as_duplicated=False, deduplicate=Fa issns: Lista de ISSNs para verificar duplicatas. user: Usuário que está executando a operação. """ - duplicated_v2 = cls.find_duplicated_v2(issns) - if duplicated_v2.exists(): - if mark_as_duplicated: - cls.objects.filter(v2__in=duplicated_v2).exclude( - proc_status=choices.PPXML_STATUS_DUPLICATED - ).update( - proc_status=choices.PPXML_STATUS_DUPLICATED, - ) - if deduplicate: - for v2 in duplicated_v2: - cls.fix_duplicated_items(user, None, v2) - duplicated_pkg_names = cls.find_duplicated_pkg_names(issns) - if duplicated_pkg_names.exists(): - if mark_as_duplicated: - cls.objects.filter(pkg_name__in=duplicated_pkg_names).exclude( - proc_status=choices.PPXML_STATUS_DUPLICATED - ).update( - proc_status=choices.PPXML_STATUS_DUPLICATED, - ) - if deduplicate: - for pkg_name in duplicated_pkg_names: - cls.fix_duplicated_items(user, pkg_name, None) + for pkg_name in duplicated_pkg_names: + cls.fix_duplicated_pkg_name(pkg_name, user) + return duplicated_pkg_names @classmethod @profile_classmethod - def fix_duplicated_items(cls, user, pkg_name, v2): + def fix_duplicated_pkg_name(cls, pkg_name, user): """ Corrige items marcados como PPXML_STATUS_DUPLICATED com base no pkg_name fornecido. Args: - user: Usuário que está executando a operação. pkg_name: Nome do pacote para verificar duplicatas. - v2: Valor do pid v2 para verificar duplicatas. + user: Usuário que está executando a operação. + Returns: int: Número de items atualizados. 
""" try: - filters = Q() - if v2: - filters |= Q(v2=v2) | Q(other_pid__pid_in_xml=v2) - if pkg_name: - filters |= Q(pkg_name=pkg_name) - items = cls.objects.filter(filters) + items = cls.objects.filter(pkg_name=pkg_name) if items.count() <= 1: return 0 @@ -1439,10 +1426,19 @@ def fix_duplicated_items(cls, user, pkg_name, v2): UnexpectedEvent.create( exception=exception, exc_traceback=exc_traceback, - action="pid_provider.models.PidProviderXML.fix_duplicated_items", + action="pid_provider.models.PidProviderXML.fix_duplicated_pkg_name", detail=pkg_name, ) + def fix_pkg_name(self, pkg_name): + if not pkg_name: + pkg_name = self.xml_with_pre.sps_pkg_name + if pkg_name and self.pkg_name != pkg_name: + self.pkg_name = pkg_name + self.save() + return True + return False + class FixPidV2(CommonControlField): """ @@ -1583,3 +1579,163 @@ def get_or_create( fixed_in_core=None, fixed_in_upload=None, ) + + +def xml_url_zipfile_path(instance, filename): + """ + Generate the upload path for XMLURL zipfile. + + Args: + instance: XMLURL instance + filename: Name of the file + + Returns: + Path string for file upload + """ + # Use URL hash to create a unique subdirectory + url_hash = abs(hash(instance.url)) % (10 ** 8) + return f"pid_provider/xmlurl/{url_hash}/{filename}" + + +class XMLURL(CommonControlField): + """ + Model to store URLs that experienced failures and should be retried in the future. + + This model tracks URLs that failed during processing, along with their status + and associated article PID, enabling retry mechanisms to reprocess them later. 
+ + Fields: + url: URLField - The URL that needs to be retried + status: CharField - To control the request status (e.g., "pending", "failed", "retrying") + pid: CharField - Article PID associated with this URL + zipfile: FileField - Compressed XML content retrieved from the URL + exceptions: CharField - Exception traceback information (truncated to 255 chars if needed) + """ + + url = models.URLField( + _("URL"), max_length=500, null=False, blank=False + ) + status = models.CharField( + _("Status"), max_length=50, null=True, blank=True + ) + pid = models.CharField( + _("Article PID"), max_length=23, null=True, blank=True + ) + zipfile = models.FileField( + _("ZIP File"), upload_to=xml_url_zipfile_path, null=True, blank=True, max_length=300, + ) + exceptions = models.CharField( + _("Exceptions"), max_length=255, null=True, blank=True + ) + + base_form_class = CoreAdminModelForm + + panels = [ + FieldPanel("url"), + FieldPanel("status"), + FieldPanel("pid"), + FieldPanel("zipfile"), + FieldPanel("exceptions"), + ] + + class Meta: + ordering = ["-updated", "-created"] + verbose_name = _("XML URL") + verbose_name_plural = _("XML URLs") + + indexes = [ + models.Index(fields=["url"]), + models.Index(fields=["status"]), + models.Index(fields=["pid"]), + ] + + def __str__(self): + return f"{self.url} - {self.status}" + + @classmethod + def get(cls, url=None): + if url: + return cls.objects.get(url=url) + raise ValueError("XMLURL.get() requires a url parameter") + + @classmethod + def create( + cls, + user, + url=None, + status=None, + pid=None, + exceptions=None, + ): + try: + obj = cls() + obj.url = url + obj.status = status + obj.pid = pid + obj.exceptions = exceptions + obj.creator = user + obj.save() + return obj + except IntegrityError: + return cls.get(url) + + @classmethod + def create_or_update( + cls, + user, + url=None, + status=None, + pid=None, + exceptions=None, + ): + try: + obj = cls.get(url=url) + obj.updated_by = user + if status is not None: + obj.status 
= status + if pid is not None: + obj.pid = pid + if exceptions is not None: + obj.exceptions = exceptions + obj.save() + return obj + except cls.DoesNotExist: + return cls.create( + user, + url, + status, + pid, + exceptions, + ) + + def save_file(self, xml_content, filename=None): + """ + Create a zip file from XML content and save it to the zipfile field. + + Args: + xml_content: str or bytes - The XML content to compress + filename: str - Optional filename for the XML inside the zip (defaults to 'content.xml') + + Returns: + bool - True if file was saved successfully, False otherwise + """ + try: + # Convert string to bytes if needed + if isinstance(xml_content, str): + xml_content = xml_content.encode('utf-8') + + # Create in-memory zip file + zip_buffer = io.BytesIO() + with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file: + # Use provided filename or default + xml_filename = filename or 'content.xml' + zip_file.writestr(xml_filename, xml_content) + + # Save the zip file to the model + zip_filename = f"{self.pid or 'unknown'}_{self.pk or 'new'}.zip" + self.zipfile.save(zip_filename, ContentFile(zip_buffer.getvalue()), save=True) + + return True + except Exception as e: + logging.error(f"Error saving zip file for XMLURL {self.url}: {e}") + return False From 337f979333b11cb06191009e2e1c2497d83ba7ea Mon Sep 17 00:00:00 2001 From: Roberta Takenaka Date: Thu, 12 Mar 2026 11:34:33 -0300 Subject: [PATCH 07/27] =?UTF-8?q?pid=5Fprovider:=20refatora=20provide=5Fpi?= =?UTF-8?q?d=5Ffor=5Fxml=5Furi=20com=20tratamento=20de=20erros=20em=20tr?= =?UTF-8?q?=C3=AAs=20categorias?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Divide o tratamento de exceções em três casos distintos: a) Falha ao obter o XML (XMLWithPre.create lança exceção): registra XMLURL com status 'xml_fetch_failed', sem pid nem zipfile. 
b) XML obtido mas criação do PidProviderXML falhou (resposta contém error_type ou error_message): registra XMLURL com status 'pid_provider_xml_failed', salva zipfile comprimido do XML obtido. c) Erro inesperado no bloco de registro: registra em UnexpectedEvent (comportamento anterior preservado). Cada caso é extraído em método privado dedicado: _handle_xml_fetch_failure, _handle_pid_provider_failure, _register_success e _handle_unexpected_error. Adiciona função auxiliar _truncate_traceback(tb_str, max_length=255) para garantir que tracebacks caibam no campo CharField(max_length=255) do modelo XMLURL. --- pid_provider/base_pid_provider.py | 155 ++++++++++++++++++++++++------ 1 file changed, 128 insertions(+), 27 deletions(-) diff --git a/pid_provider/base_pid_provider.py b/pid_provider/base_pid_provider.py index 26d497b15..14033f9ff 100644 --- a/pid_provider/base_pid_provider.py +++ b/pid_provider/base_pid_provider.py @@ -1,5 +1,5 @@ -import logging import sys +import traceback # from django.utils.translation import gettext_lazy as _ from packtools.sps.pid_provider.xml_sps_lib import XMLWithPre, get_xml_with_pre @@ -7,10 +7,33 @@ from core.utils.profiling_tools import ( # ajuste o import conforme sua estrutura profile_method, ) -from pid_provider.models import PidProviderXML +from pid_provider.models import PidProviderXML, XMLURL from tracker.models import UnexpectedEvent +def _truncate_traceback(tb_str, max_length=255): + """ + Truncate traceback string to fit in max_length. + If longer than max_length, keep start and end portions. + + Args: + tb_str: Traceback string (can be None) + max_length: Maximum length (default 255) + + Returns: + Truncated traceback string or None + """ + if tb_str is None: + return None + + if len(tb_str) <= max_length: + return tb_str + + # Keep beginning and end with "..." in the middle + keep_chars = (max_length - 5) // 2 # Reserve 5 chars for " ... " + return tb_str[:keep_chars] + " ... 
" + tb_str[-keep_chars:] + + class BasePidProvider: def __init__(self): self.caller = None @@ -159,38 +182,25 @@ def provide_pid_for_xml_uri( ): """ Fornece / Valida PID de um XML disponível por um URI + + This method handles three types of exceptions: + a) Failure to obtain XML - registers only URL, status, and PID in XMLURL + b) Successfully obtain XML but fail to create PidProviderXML record - + registers everything + saves compressed XML content + c) Unexpected errors - logs in UnexpectedEvent Returns ------- dict """ + # a) Try to obtain XML from URI try: xml_with_pre = list(XMLWithPre.create(uri=xml_uri))[0] except Exception as e: - exc_type, exc_value, exc_traceback = sys.exc_info() - detail = dict( - error_msg=str(e), - error_type=str(exc_type), - exc_value=str(exc_value), - exc_traceback=str(exc_traceback), - ) - UnexpectedEvent.create( - exception=e, - exc_traceback=exc_traceback, - detail={ - "operation": "PidProvider.provide_pid_for_xml_uri", - "input": dict( - xml_uri=xml_uri, - user=user.username, - name=name, - origin_date=origin_date, - force_update=force_update, - is_published=is_published, - ), - }, - ) - return detail - else: + return self._handle_xml_fetch_failure(e, xml_uri, name, user, origin_date, force_update, is_published) + + # b) Try to create PidProviderXML record + try: response = self.provide_pid_for_xml_with_pre( xml_with_pre, name, @@ -202,8 +212,99 @@ def provide_pid_for_xml_uri( registered_in_core=registered_in_core, auto_solve_pid_conflict=auto_solve_pid_conflict, ) - + + # Handle response based on success or failure + if response.get("error_type") or response.get("error_message"): + self._handle_pid_provider_failure(response, xml_with_pre, xml_uri, name, user, origin_date, force_update, is_published) + else: + self._register_success(xml_with_pre, xml_uri, name, user, response) + return response + + except Exception as e: + return self._handle_unexpected_error(e, xml_uri, name, user, origin_date, force_update, is_published) + + 
def _handle_xml_fetch_failure(self, exception, xml_uri, name, user, origin_date, force_update, is_published): + """Handle exception type a) - Failure to obtain XML""" + exc_type, exc_value, exc_traceback = sys.exc_info() + + # Get traceback and truncate if needed + tb_str = traceback.format_exc() + truncated_tb = _truncate_traceback(tb_str) + + # Store exception in XMLURL instead of UnexpectedEvent + XMLURL.create_or_update( + user=user, + url=xml_uri, + status="xml_fetch_failed", + pid=None, + exceptions=truncated_tb, + ) + + return dict( + error_msg=str(exception), + error_type=str(exc_type), + exc_value=str(exc_value), + exc_traceback=str(exc_traceback), + ) + + def _handle_pid_provider_failure(self, response, xml_with_pre, xml_uri, name, user, origin_date, force_update, is_published): + """Handle exception type b) - XML obtained but PidProviderXML creation failed""" + # Format error information from response (not from an exception context) + error_msg = response.get("error_message", "Unknown error") + error_type = response.get("error_type", "Unknown") + error_info = f"{error_type}: {error_msg}" + truncated_error = _truncate_traceback(error_info) + + # Create or update XMLURL with exception info and save zipfile + xmlurl_obj = XMLURL.create_or_update( + user=user, + url=xml_uri, + status="pid_provider_xml_failed", + pid=response.get("id") or response.get("v3"), + exceptions=truncated_error, + ) + # Use XMLWithPre.tostring() method + xmlurl_obj.save_file(xml_with_pre.tostring(), filename=name or 'content.xml') + + def _register_success(self, xml_with_pre, xml_uri, name, user, response): + """Register successful XML processing in XMLURL""" + xmlurl_obj = XMLURL.create_or_update( + user=user, + url=xml_uri, + status="success", + pid=response.get("v3"), + ) + # Use XMLWithPre.tostring() method + xmlurl_obj.save_file(xml_with_pre.tostring(), filename=name or 'content.xml') + + def _handle_unexpected_error(self, exception, xml_uri, name, user, origin_date, 
force_update, is_published): + """Handle exception type c) - Unexpected error during processing""" + exc_type, exc_value, exc_traceback = sys.exc_info() + + UnexpectedEvent.create( + exception=exception, + exc_traceback=exc_traceback, + detail={ + "operation": "PidProvider.provide_pid_for_xml_uri", + "exception_type": "unexpected_error", + "input": dict( + xml_uri=xml_uri, + user=user.username, + name=name, + origin_date=origin_date, + force_update=force_update, + is_published=is_published, + ), + }, + ) + + return dict( + error_msg=str(exception), + error_type=str(exc_type), + exc_value=str(exc_value), + exc_traceback=str(exc_traceback), + ) @classmethod @profile_method From 79332ddb3cdf5f86a5775be6e9f2e8041c99a898 Mon Sep 17 00:00:00 2001 From: Roberta Takenaka Date: Thu, 12 Mar 2026 11:34:51 -0300 Subject: [PATCH 08/27] pid_provider: substitui objects.get por get_by_pid_v3 em harvesting.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit provide_pid_for_opac_and_am_xml passa a usar PidProviderXML.get_by_pid_v3 para busca por pid_v3 e pid_v2 de forma unificada, eliminando dois blocos separados de objects.get. Adiciona retorno explícito None no caminho de erro ao final da função. 
--- pid_provider/sources/harvesting.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/pid_provider/sources/harvesting.py b/pid_provider/sources/harvesting.py index f04966cbd..ea3f73c29 100644 --- a/pid_provider/sources/harvesting.py +++ b/pid_provider/sources/harvesting.py @@ -35,13 +35,11 @@ def provide_pid_for_opac_and_am_xml( if not force_update: try: - if pid_v3: - pid_xml = PidProviderXML.objects.get(v3=pid_v3) - if pid_v2: - pid_xml = PidProviderXML.objects.get(v2=pid_v2) - return pid_xml.data + pid_xml = PidProviderXML.get_by_pid_v3(pid_v3=pid_v3, pid_v2=pid_v2) + if pid_xml: + return pid_xml.data except PidProviderXML.DoesNotExist: - pass + pid_xml = None detail = dict( pid_v2=pid_v2, @@ -84,3 +82,4 @@ def provide_pid_for_opac_and_am_xml( ), }, ) + return None From 1242b12c5573723571e2da2f015f610e34d2881e Mon Sep 17 00:00:00 2001 From: Roberta Takenaka Date: Thu, 12 Mar 2026 11:34:51 -0300 Subject: [PATCH 09/27] pid_provider: atualiza import de CommonControlFieldViewSet para core.views --- pid_provider/wagtail_hooks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pid_provider/wagtail_hooks.py b/pid_provider/wagtail_hooks.py index 833e87f6e..228742585 100644 --- a/pid_provider/wagtail_hooks.py +++ b/pid_provider/wagtail_hooks.py @@ -5,7 +5,7 @@ from wagtail.snippets.views.snippets import SnippetViewSetGroup from config.menu import get_menu_order -from core.viewsets import CommonControlFieldViewSet +from core.views import CommonControlFieldViewSet from pid_provider.models import XMLVersion, FixPidV2, OtherPid, PidProviderConfig, PidProviderXML From ccacb0a9f1deb8b8e11d5758b60a50d81cd6218b Mon Sep 17 00:00:00 2001 From: Roberta Takenaka Date: Thu, 12 Mar 2026 11:34:52 -0300 Subject: [PATCH 10/27] pid_provider: adiciona testes para XMLURL e BasePidProvider.provide_pid_for_xml_uri MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit XMLURLTest: - test_create_xmlurl: verifica 
criação com url, status, pid e creator. - test_get_xmlurl: verifica recuperação pelo campo url. - test_create_or_update_existing: verifica atualização de registro existente. - test_create_or_update_new: verifica criação quando não existe. - test_save_file_with_string_content: verifica geração de ZIP a partir de string. - test_save_file_with_bytes_content: verifica geração de ZIP a partir de bytes. - test_save_file_default_filename: verifica nome de arquivo padrão no ZIP. - test_str_method: verifica representação string do modelo. BasePidProviderXMLURITest: - test_provide_pid_for_xml_uri_fetch_failure: simula falha de rede e verifica XMLURL com status 'xml_fetch_failed'. - test_provide_pid_for_xml_uri_success: simula registro bem-sucedido e verifica XMLURL com status 'success' e pid preenchido. - test_provide_pid_for_xml_uri_registration_failure: simula falha de registro e verifica XMLURL com status 'pid_provider_xml_failed' e zipfile salvo. --- pid_provider/test_models.py | 225 ++++++++++++++++++++++++++++++++++++ 1 file changed, 225 insertions(+) diff --git a/pid_provider/test_models.py b/pid_provider/test_models.py index 2a21801eb..09626540c 100644 --- a/pid_provider/test_models.py +++ b/pid_provider/test_models.py @@ -802,3 +802,228 @@ def test_register_with_success( self.assertIsNone(result["updated"]) self.assertIsNotNone(result["created"]) mock_pid_request_save.assert_not_called() + + +class XMLURLTest(TestCase): + """Tests for XMLURL model""" + + def setUp(self): + self.user = User.objects.create_user(username="testuser", password="testpass") + self.test_url = "http://example.com/article.xml" + self.test_pid = "ABC123XYZ456" + + def test_create_xmlurl(self): + """Test creating a new XMLURL instance""" + xmlurl = models.XMLURL.create( + user=self.user, + url=self.test_url, + status="pending", + pid=self.test_pid, + ) + + self.assertIsNotNone(xmlurl) + self.assertEqual(xmlurl.url, self.test_url) + self.assertEqual(xmlurl.status, "pending") + 
self.assertEqual(xmlurl.pid, self.test_pid) + self.assertEqual(xmlurl.creator, self.user) + + def test_get_xmlurl(self): + """Test getting an XMLURL by URL""" + models.XMLURL.create( + user=self.user, + url=self.test_url, + status="pending", + pid=self.test_pid, + ) + + xmlurl = models.XMLURL.get(url=self.test_url) + self.assertIsNotNone(xmlurl) + self.assertEqual(xmlurl.url, self.test_url) + + def test_create_or_update_existing(self): + """Test updating an existing XMLURL""" + # Create initial record + xmlurl = models.XMLURL.create( + user=self.user, + url=self.test_url, + status="pending", + pid=None, + ) + + # Update it + updated_xmlurl = models.XMLURL.create_or_update( + user=self.user, + url=self.test_url, + status="success", + pid=self.test_pid, + ) + + self.assertEqual(updated_xmlurl.id, xmlurl.id) + self.assertEqual(updated_xmlurl.status, "success") + self.assertEqual(updated_xmlurl.pid, self.test_pid) + self.assertEqual(updated_xmlurl.updated_by, self.user) + + def test_create_or_update_new(self): + """Test creating a new XMLURL when it doesn't exist""" + xmlurl = models.XMLURL.create_or_update( + user=self.user, + url=self.test_url, + status="pending", + pid=self.test_pid, + ) + + self.assertIsNotNone(xmlurl) + self.assertEqual(xmlurl.url, self.test_url) + self.assertEqual(xmlurl.status, "pending") + + def test_save_file_with_string_content(self): + """Test save_file method with string XML content""" + xmlurl = models.XMLURL.create( + user=self.user, + url=self.test_url, + status="pending", + pid=self.test_pid, + ) + + xml_content = "
Test Article
" + result = xmlurl.save_file(xml_content, filename="test.xml") + + self.assertTrue(result) + self.assertTrue(xmlurl.zipfile.name) + + def test_save_file_with_bytes_content(self): + """Test save_file method with bytes XML content""" + xmlurl = models.XMLURL.create( + user=self.user, + url=self.test_url, + status="pending", + pid=self.test_pid, + ) + + xml_content = b"
Test Article
" + result = xmlurl.save_file(xml_content, filename="test.xml") + + self.assertTrue(result) + self.assertTrue(xmlurl.zipfile.name) + + def test_save_file_default_filename(self): + """Test save_file method with default filename""" + xmlurl = models.XMLURL.create( + user=self.user, + url=self.test_url, + status="pending", + pid=self.test_pid, + ) + + xml_content = "
Test Article
" + result = xmlurl.save_file(xml_content) + + self.assertTrue(result) + self.assertTrue(xmlurl.zipfile.name) + + def test_str_method(self): + """Test __str__ method""" + xmlurl = models.XMLURL.create( + user=self.user, + url=self.test_url, + status="pending", + pid=self.test_pid, + ) + + expected_str = f"{self.test_url} - pending" + self.assertEqual(str(xmlurl), expected_str) + + +class BasePidProviderXMLURITest(TestCase): + """Tests for BasePidProvider.provide_pid_for_xml_uri method""" + + def setUp(self): + self.user = User.objects.create_user(username="testuser", password="testpass") + + @patch("pid_provider.base_pid_provider.XMLWithPre.create") + def test_provide_pid_for_xml_uri_fetch_failure(self, mock_create): + """Test exception type a) - Failure to obtain XML""" + from pid_provider.base_pid_provider import BasePidProvider + + # Mock XMLWithPre.create to raise an exception + mock_create.side_effect = Exception("Connection timeout") + + provider = BasePidProvider() + result = provider.provide_pid_for_xml_uri( + xml_uri="http://example.com/article.xml", + name="test.xml", + user=self.user, + ) + + # Should return error details + self.assertIn("error_msg", result) + self.assertIn("error_type", result) + + # Should create XMLURL with failed status + xmlurl = models.XMLURL.get(url="http://example.com/article.xml") + self.assertEqual(xmlurl.status, "xml_fetch_failed") + self.assertIsNone(xmlurl.pid) + + @patch("pid_provider.base_pid_provider.XMLWithPre.create") + @patch.object(models.PidProviderXML, "register") + def test_provide_pid_for_xml_uri_success(self, mock_register, mock_create): + """Test successful processing with XMLURL creation""" + from pid_provider.base_pid_provider import BasePidProvider + + # Mock XMLWithPre.create + xml_with_pre = _get_xml_with_pre("
Test
") + mock_create.return_value = [xml_with_pre] + + # Mock successful registration + mock_register.return_value = { + "v3": "test_v3_pid", + "v2": "test_v2_pid", + "created": datetime.now(), + } + + provider = BasePidProvider() + result = provider.provide_pid_for_xml_uri( + xml_uri="http://example.com/article.xml", + name="test.xml", + user=self.user, + ) + + # Should return success response + self.assertEqual(result.get("v3"), "test_v3_pid") + + # Should create XMLURL with success status + xmlurl = models.XMLURL.get(url="http://example.com/article.xml") + self.assertEqual(xmlurl.status, "success") + self.assertEqual(xmlurl.pid, "test_v3_pid") + + @patch("pid_provider.base_pid_provider.XMLWithPre.create") + @patch.object(models.PidProviderXML, "register") + def test_provide_pid_for_xml_uri_registration_failure(self, mock_register, mock_create): + """Test exception type b) - XML obtained but registration failed""" + from pid_provider.base_pid_provider import BasePidProvider + + # Mock XMLWithPre.create + xml_with_pre = _get_xml_with_pre("
Test
") + mock_create.return_value = [xml_with_pre] + + # Mock failed registration + mock_register.return_value = { + "error_type": "ValidationError", + "error_msg": "Invalid XML structure", + "v3": "test_v3_pid", + } + + provider = BasePidProvider() + result = provider.provide_pid_for_xml_uri( + xml_uri="http://example.com/article2.xml", + name="test2.xml", + user=self.user, + ) + + # Should return error response + self.assertIn("error_type", result) + + # Should create XMLURL with failed status and save zipfile + xmlurl = models.XMLURL.get(url="http://example.com/article2.xml") + self.assertEqual(xmlurl.status, "pid_provider_xml_failed") + self.assertEqual(xmlurl.pid, "test_v3_pid") From d059d829709378780c23ae5250817b09eb0a3373 Mon Sep 17 00:00:00 2001 From: Roberta Takenaka Date: Thu, 12 Mar 2026 11:35:05 -0300 Subject: [PATCH 11/27] =?UTF-8?q?article:=20refatora=20ArticleSource,=20ad?= =?UTF-8?q?iciona=20exce=C3=A7=C3=B5es=20tipadas=20e=20melhora=20Article/A?= =?UTF-8?q?rticleAffiliation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Exceções novas: - RequestXMLException: erro não-recuperável ao requisitar XML (NonRetryableError). - XMLException: erro genérico ao processar conteúdo XML. - UnableToRegisterPIDError: falha ao registrar PID no PidProvider. Article: - Remove método complete_data (lógica internalizada no ArticleSource). - is_pp_xml_valid: usa PidProviderXML.get_by_pid_v3 em vez de objects.get(v3=); trata DoesNotExist atribuindo None. ArticleSource: - Novos StatusChoices: URL_ERROR e XML_ERROR. - Novo cached_property xml_with_pre: tenta obter XMLWithPre de pid_provider_xml, depois do arquivo local e por último da URL. - cached_property sps_pkg_name: delega para xml_with_pre.sps_pkg_name. - request_xml: remove parâmetro opcional detail/force_update; lança RequestXMLException para NonRetryableError e XMLException para demais. 
- Renomeia complete_data → add_pid_provider: pipeline em duas etapas (request_xml + request_pid) com skip inteligente quando arquivo/pid já existem e force_update=False; tratamento separado para XMLException (→ XML_ERROR), RequestXMLException (→ URL_ERROR) e exceções genéricas (→ ERROR). - Renomeia get_or_create_pid_v3 → request_pid: usa get_by_pid_v3; lança UnableToRegisterPIDError quando v3 não retornado. - create / create_or_update: adicionam parâmetro auto_solve_pid_conflict e delegam para add_pid_provider; simplificam fluxo removendo chamadas diretas a request_xml e save(). - is_completed: adiciona logging detalhado em cada condição de retorno False. - mark_as_url_error / mark_as_xml_error: novos métodos de marcação. - Métodos mark_as_completed/error/reprocess: sem alteração de interface. ArticleAffiliation: - Adiciona autocomplete_label e autocomplete_custom_queryset_filter para suporte ao widget de autocompletar no admin. --- article/models.py | 271 +++++++++++++++++++++++++++++----------------- 1 file changed, 171 insertions(+), 100 deletions(-) diff --git a/article/models.py b/article/models.py index 34bc5ac4a..ff10cd59b 100755 --- a/article/models.py +++ b/article/models.py @@ -3,7 +3,7 @@ import sys import traceback from datetime import datetime -from functools import lru_cache, cached_property +from functools import cached_property from django.core.files.base import ContentFile from django.db import IntegrityError, models @@ -20,6 +20,7 @@ from wagtail.admin.panels import FieldPanel, InlinePanel, ObjectList, TabbedInterface from wagtail.models import Orderable from wagtailautocomplete.edit_handlers import AutocompletePanel +from packtools.sps.libs.requester import NonRetryableError from article import choices from article.utils.url_builder import ArticleURLBuilder @@ -52,6 +53,18 @@ from vocabulary.models import Keyword +class RequestXMLException(Exception): + """Exceção personalizada para erros na requisição de XML""" + pass + +class 
XMLException(Exception): + """Exceção personalizada para erros na requisição de XML""" + pass + +class UnableToRegisterPIDError(Exception): + """Exceção personalizada para erros ao registrar PID""" + pass + class AMArticle(BaseLegacyRecord): """ Modelo que representa a coleta de dados de Issue na API Article Meta. @@ -566,17 +579,6 @@ def mark_as_completed(self, user=None): self.save() # Salvar estado final self.pp_xml.mark_as_done() - def complete_data(self, pp_xml, save=False): - if pp_xml: - if not self.sps_pkg_name: - self.sps_pkg_name = pp_xml.pkg_name - save = True - if not self.pp_xml: - self.pp_xml = pp_xml - save = True - if save: - self.save() - def set_date_pub(self, dates, save=True): if dates: self.pub_date_day = dates.get("day") @@ -901,9 +903,10 @@ def mark_items_as_invalid(cls, journal=None, journal_id=None): def is_pp_xml_valid(self): if not self.pp_xml: try: - self.pp_xml = PidProviderXML.objects.get(v3=self.pid_v3) + self.pp_xml = PidProviderXML.get_by_pid_v3(pid_v3=self.pid_v3) except PidProviderXML.DoesNotExist: - pass + self.pp_xml = None + if not self.pp_xml or not self.pp_xml.xml_with_pre: if self.data_status != choices.DATA_STATUS_INVALID: self.data_status = choices.DATA_STATUS_INVALID @@ -1686,6 +1689,8 @@ class StatusChoices(models.TextChoices): COMPLETED = "completed", _("Completed") ERROR = "error", _("Error") REPROCESS = "reprocess", _("Reprocess") + URL_ERROR = "url_error", _("URL Error") + XML_ERROR = "xml_error", _("XML Error") url = models.URLField( verbose_name=_("Article URL"), @@ -1793,7 +1798,7 @@ def get(cls, url): raise ValueError("ArticleSource.get requires url") @classmethod - def create(cls, user, url=None, source_date=None, am_article=None): + def create(cls, user, url=None, source_date=None, am_article=None, force_update=None, auto_solve_pid_conflict=False): if not url: raise ValueError("ArticleSource.create requires url") @@ -1804,69 +1809,83 @@ def create(cls, user, url=None, source_date=None, am_article=None): 
obj.source_date = source_date obj.am_article = am_article obj.status = cls.StatusChoices.PENDING - obj.save() - try: - obj.request_xml(detail=[]) - except Exception as e: - pass + obj.add_pid_provider(user, force_update, auto_solve_pid_conflict=auto_solve_pid_conflict) return obj except IntegrityError: return cls.get(url=url) @classmethod def create_or_update( - cls, user, url=None, source_date=None, am_article=None, force_update=None + cls, user, url=None, source_date=None, am_article=None, force_update=None, auto_solve_pid_conflict=False ): try: logging.info( f"ArticleSource.create_or_update {url} {source_date} {am_article} {force_update}" ) obj = cls.get(url=url) - if ( force_update or (source_date and source_date != obj.source_date) or not obj.is_completed ): - logging.info(f"updating source: {(source_date, obj.source_date)}") - logging.info(f"updating am_article: {(am_article, obj.am_article)}") - logging.info( - f"updating file: {not obj.file or not obj.file.path or not os.path.isfile(obj.file.path)}" - ) - obj.request_xml() obj.updated_by = user obj.source_date = source_date obj.am_article = am_article - obj.status = cls.StatusChoices.REPROCESS - obj.save() - + obj.add_pid_provider(user, force_update, auto_solve_pid_conflict=auto_solve_pid_conflict) + return obj except cls.DoesNotExist: - obj = cls.create( - user, url=url, source_date=source_date, am_article=am_article + return cls.create( + user, + url=url, + source_date=source_date, + am_article=am_article, + force_update=force_update, + auto_solve_pid_conflict=auto_solve_pid_conflict ) - return obj + + @cached_property + def xml_with_pre(self): + if self.pid_provider_xml: + try: + return self.pid_provider_xml.xml_with_pre + except AttributeError: + pass + if self.file and self.file.path and os.path.isfile(self.file.path): + try: + return XMLWithPre.from_file(self.file.path) + except Exception as e: + pass + if self.url: + try: + return list(XMLWithPre.create(uri=self.url))[0] + except Exception as e: + 
pass @cached_property def sps_pkg_name(self): try: - xml_with_pre = list(XMLWithPre.create(path=self.file.path))[0] - except: - xml_with_pre = list(XMLWithPre.create(uri=self.url))[0] - return xml_with_pre.sps_pkg_name + return self.xml_with_pre.sps_pkg_name + except Exception: + pass - def request_xml(self, detail=None, force_update=False): + def request_xml(self, detail): if not self.url: raise ValueError("URL is required") - if force_update or not self.is_completed: - if detail: - detail.append("create file") - - logging.info(f"ArticleSource.request_xml for {self.url}") + logging.info(f"ArticleSource.request_xml for {self.url}") + try: xml_with_pre = list(XMLWithPre.create(uri=self.url))[0] self.save_file( - f"{self.sps_pkg_name}.xml", xml_with_pre.tostring(pretty_print=True) + f"{xml_with_pre.sps_pkg_name}.xml", xml_with_pre.tostring(pretty_print=True) ) + except NonRetryableError as e: + raise RequestXMLException( + f"Non-retryable error while requesting XML: {e}" + ) from e + except Exception as e: + raise XMLException( + f"Error while requesting XML: {e}" + ) from e def save_file(self, filename, content): try: @@ -1891,6 +1910,16 @@ def mark_as_error(self): self.status = self.StatusChoices.ERROR self.save() + def mark_as_url_error(self): + """Marca como erro de URL""" + self.status = self.StatusChoices.URL_ERROR + self.save() + + def mark_as_xml_error(self): + """Marca como erro de XML""" + self.status = self.StatusChoices.XML_ERROR + self.save() + def mark_for_reprocess(self): """Marca para reprocessamento""" self.status = self.StatusChoices.REPROCESS @@ -1957,83 +1986,107 @@ def get_queryset_to_complete_data( @property def is_completed(self): if not self.pid_provider_xml: + logging.info(f"Not completed: ArticleSource {self.url} has no pid_provider_xml") return False - if not self.pid_provider_xml.xml_with_pre: - return False + try: + if not self.pid_provider_xml.xml_with_pre: + logging.info(f"Not completed: ArticleSource {self.url} has pid_provider_xml but 
no xml_with_pre") + return False + except Exception: + pass if not self.am_article: + logging.info(f"Not completed: ArticleSource {self.url} has no am_article") return False if not self.file: + logging.info(f"Not completed: ArticleSource {self.url} has no file") return False if not self.file.path or not os.path.isfile(self.file.path): + logging.info(f"Not completed: ArticleSource {self.url} has file path invalid or file does not exist") return False if self.status != ArticleSource.StatusChoices.COMPLETED: self.status = ArticleSource.StatusChoices.COMPLETED self.save() + logging.info(f"Completed: ArticleSource {self.url} is completed") return True - def complete_data(self, user, force_update=False, auto_solve_pid_conflict=False): + def add_pid_provider(self, user, force_update=False, auto_solve_pid_conflict=False): """ - Processa um arquivo XML de artigo científico, criando ou atualizando os dados necessários. - - Este método gerencia todo o fluxo de processamento de um XML de artigo, incluindo: - - Download/criação do arquivo XML se necessário - - Geração de PID (Persistent Identifier) através do PidProvider - - Args: - user: Usuário responsável pelo processamento - force_update (bool): Se True, força a atualização mesmo se os dados já existem - auto_solve_pid_conflict (bool): Se True, resolve automaticamente conflitos de PID - - Raises: - ValueError: Se a URL não estiver definida - - Note: - O método atualiza os seguintes atributos do objeto: - - status: Estado do processamento (PENDING, COMPLETED, ERROR) - - file: Arquivo XML baixado/criado - - pid_provider_xml: Objeto PidProviderXML associado - - detail: Lista com detalhes do processamento + Executa o pipeline de obtenção de XML e registro de PID para este + ArticleSource. Evita refazer etapas já concluídas, a menos que + ``force_update=True``. + + Etapas: + 1. request_xml — baixa o XML da URL e salva em ``self.file`` + • Pula se ``self.file`` já existe em disco E ``force_update`` é False + 2. 
request_pid — registra o XML no PidProvider e associa ``self.pid_provider_xml`` + • Pula se ``self.pid_provider_xml`` já está associado E ``force_update`` é False + + Se uma etapa anterior falhou (ex: tem arquivo mas não tem + pid_provider_xml), somente a etapa faltante é executada. """ - try: - # Lista para armazenar detalhes do processamento detail = [] - if not force_update: - if self.is_completed: - return - - # Define status inicial como pendente self.status = ArticleSource.StatusChoices.PENDING - if not self.file.path or not os.path.isfile(self.file.path): - self.request_xml(detail, force_update) - - pid_v3 = self.get_or_create_pid_v3( - user, detail, force_update, auto_solve_pid_conflict + # --- Etapa 1: request_xml --- + has_valid_file = ( + self.file + and self.file.name + and os.path.isfile(self.file.path) ) - if not pid_v3: - raise ValueError("Failed to obtain or create PID v3") + + if force_update or not has_valid_file: + logging.info(f"Requesting XML for {self.url}") + self.request_xml(detail) + logging.info(f"XML requested successfully for {self.url}") + else: + logging.info( + f"Skipping request_xml: file already exists for {self.url}" + ) + detail.append("request_xml skipped (file already exists)") + + # --- Etapa 2: request_pid --- + has_pid_provider = self.pid_provider_xml is not None + + if force_update or not has_pid_provider: + logging.info(f"Requesting PID for {self.url}") + self.request_pid( + user, detail, force_update, auto_solve_pid_conflict + ) + logging.info( + f"PID requested successfully for {self.pid_provider_xml}" + ) + else: + logging.info( + f"Skipping request_pid: pid_provider_xml already set " + f"for {self.url}" + ) + detail.append("request_pid skipped (pid_provider_xml already set)") + self.detail = detail - self.mark_as_completed() # Marca o processamento como concluído + self.mark_as_completed() + logging.info(f"ArticleSource {self.status}") + except XMLException as e: + exc_type, exc_value, exc_traceback = sys.exc_info() + 
detail.append(str({"error_type": str(type(e)), "error_message": str(e)})) + self.detail = detail + self.mark_as_xml_error() + logging.info(f"ArticleSource {self.url} marked as XML error") + except RequestXMLException as e: + exc_type, exc_value, exc_traceback = sys.exc_info() + detail.append(str({"error_type": str(type(e)), "error_message": str(e)})) + self.detail = detail + self.mark_as_url_error() + logging.info(f"ArticleSource {self.url} marked as URL error") except Exception as e: - # Registra a exceção no log logging.exception(e) - - # Obtém informações detalhadas da exceção exc_type, exc_value, exc_traceback = sys.exc_info() - - # Adiciona informações do erro aos detalhes detail.append(str({"error_type": str(type(e)), "error_message": str(e)})) self.detail = detail - - # Marca o processamento como erro self.mark_as_error() - def get_or_create_pid_v3(self, user, detail, force_update, auto_solve_pid_conflict): - if self.pid_provider_xml and self.pid_provider_xml.xml_with_pre: - if not force_update: - return self.pid_provider_xml.v3 + def request_pid(self, user, detail, force_update, auto_solve_pid_conflict): try: detail.append("create pid_provider_xml") @@ -2055,12 +2108,12 @@ def get_or_create_pid_v3(self, user, detail, force_update, auto_solve_pid_confli # Obtém a primeira resposta (assumindo apenas uma) response = list(responses)[0] v3 = response.get("v3") - if v3: # Associa o PidProviderXML ao ArticleSource - self.pid_provider_xml = PidProviderXML.objects.get(v3=v3) + self.pid_provider_xml = PidProviderXML.get_by_pid_v3(v3) + if not self.pid_provider_xml: + raise UnableToRegisterPIDError("Failed to obtain or create PID v3") detail.append("set pid_provider_xml") - return v3 else: # Registra erro se não conseguiu obter v3 detail.append(str(response)) @@ -2071,14 +2124,13 @@ def get_or_create_pid_v3(self, user, detail, force_update, auto_solve_pid_confli exception=e, exc_traceback=exc_traceback, detail=dict( - 
function="article.models.ArticleSource.get_or_create_pid_v3", + function="article.models.ArticleSource.request_pid", article_source_id=self.id, - sps_pkg_name=self.sps_pkg_name, url=self.url, ), ) detail.append(str(unexpected_event.data)) - raise e + raise UnableToRegisterPIDError(str(e)) class ArticleAvailability(CommonControlField): @@ -2341,6 +2393,25 @@ class Meta: models.Index(fields=["article", "organization"]), ] + def autocomplete_label(self): + if self.organization: + return f"{self.article} - {self.organization}" + if self.raw_institution_name: + return f"{self.article} - {self.raw_institution_name}" + if self.raw_text: + return f"{self.article} - {self.raw_text}" + return f"{self.article} - Affiliation" + + @staticmethod + def autocomplete_custom_queryset_filter(search_term): + return ArticleAffiliation.objects.filter( + Q(raw_text__icontains=search_term) + | Q(raw_institution_name__icontains=search_term) + | Q(raw_country_name__icontains=search_term) + | Q(raw_state_name__icontains=search_term) + | Q(raw_city_name__icontains=search_term) + ) + def __str__(self): if self.organization: return f"{self.article} - {self.organization}" From f6a8147988c9361c105c53d9f9cf8d5d3266b9ff Mon Sep 17 00:00:00 2001 From: Roberta Takenaka Date: Thu, 12 Mar 2026 12:27:33 -0300 Subject: [PATCH 12/27] =?UTF-8?q?article:=20usa=20get=5Fby=5Fpid=5Fv3=20e?= =?UTF-8?q?=20armazena=20sps=5Fpkg=5Fname=20em=20vari=C3=A1vel=20local=20e?= =?UTF-8?q?m=20xmlsps.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit load_article: - Resolve pp_xml por v3 antes do bloco try usando get_by_pid_v3, eliminando o elif v3 dentro do try. - Extrai xml_with_pre.sps_pkg_name em variável local sps_pkg_name para evitar acessos repetidos ao atributo nos logs e atribuições. 
--- article/sources/xmlsps.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/article/sources/xmlsps.py b/article/sources/xmlsps.py index 405f5278d..2bcd41fd5 100755 --- a/article/sources/xmlsps.py +++ b/article/sources/xmlsps.py @@ -111,12 +111,15 @@ def load_article(user, xml=None, file_path=None, v3=None, pp_xml=None): "load_article() requires params: pp_xml or v3 or file_path or xml" ) + if not pp_xml and v3: + try: + pp_xml = PidProviderXML.get_by_pid_v3(pid_v3=v3) + except PidProviderXML.DoesNotExist: + pp_xml = None + try: if pp_xml: xml_with_pre = pp_xml.xml_with_pre - elif v3: - pp_xml = PidProviderXML.objects.get(v3=v3) - xml_with_pre = pp_xml.xml_with_pre elif file_path: for xml_with_pre in XMLWithPre.create(file_path): xmltree = xml_with_pre.xmltree @@ -163,15 +166,16 @@ def load_article(user, xml=None, file_path=None, v3=None, pp_xml=None): event = None xmltree = xml_with_pre.xmltree - logging.info(f"Article {pid_v3} {xml_with_pre.sps_pkg_name}") + sps_pkg_name = xml_with_pre.sps_pkg_name + logging.info(f"Article {pid_v3} {sps_pkg_name}") # CRIAÇÃO/OBTENÇÃO DO OBJETO PRINCIPAL article = Article.create_or_update( user=user, pid_v3=pid_v3, - sps_pkg_name=xml_with_pre.sps_pkg_name, + sps_pkg_name=sps_pkg_name, ) - logging.info(f"...Article {pid_v3} {xml_with_pre.sps_pkg_name}") + logging.info(f"...Article {pid_v3} {sps_pkg_name}") article.events.all().delete() event = article.add_event(user, _("load article")) @@ -180,7 +184,7 @@ def load_article(user, xml=None, file_path=None, v3=None, pp_xml=None): article.valid = False article.data_status = choices.DATA_STATUS_PENDING article.pp_xml = pp_xml - article.sps_pkg_name = xml_with_pre.sps_pkg_name + article.sps_pkg_name = sps_pkg_name # CAMPOS SIMPLES EXTRAÍDOS DO XML set_pids(xmltree=xmltree, article=article, errors=errors) @@ -213,7 +217,7 @@ def load_article(user, xml=None, file_path=None, v3=None, pp_xml=None): # Salvar uma vez após definir todos os campos simples 
logging.info( - f"Saving article {article.pid_v3} {xml_with_pre.sps_pkg_name} {xml_with_pre.main_doi}" + f"Saving article {article.pid_v3} {sps_pkg_name} {xml_with_pre.main_doi}" ) add_data_availability_status( From fefde02c6c8e103cd39b80c8142ecf7fcf11344c Mon Sep 17 00:00:00 2001 From: Roberta Takenaka Date: Thu, 12 Mar 2026 12:27:33 -0300 Subject: [PATCH 13/27] =?UTF-8?q?article:=20adiciona=20campos=20ao=20?= =?UTF-8?q?=C3=ADndice=20de=20busca=20e=20corrige=20URLs=20para=20usar=20c?= =?UTF-8?q?ollection.base=5Furl?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ArticleIndex — novos campos: - issn (MultiValueField): ISSN eletrônico, impresso e ISSN-L do periódico. - license (CharField): tipo de licença do artigo. - aff_country / aff_institution (MultiValueField): países e instituições das afiliações para filtro geográfico/institucional. - open_access (CharField): status OA do periódico. - indexed_at (MultiValueField): bases de indexação do periódico. - crossmark_active (BooleanField): indica se Crossmark está ativo. ArticleOAIIndex — novos campos: - issn: ISSN oficial via metadata.dc.relation. - publisher: nomes dos editores via metadata.dc.publisher. - orcid: ORCIDs dos autores via metadata.dc.contributor.orcid. - format_: formato do documento via metadata.dc.format. - prepare_date: simplificado para retornar obj.pub_date diretamente. Correção de URLs (ArticleIndex e ArticleOAIIndex): - Substitui 'http://%s' % collection.domain por '%s' % collection.base_url em fulltext_pdf_*, fulltext_html_*, urls e identifier, garantindo protocolo correto (https) e evitando duplicação de 'http://'. 
--- article/search_indexes.py | 106 ++++++++++++++++++++++++++++++++------ 1 file changed, 89 insertions(+), 17 deletions(-) diff --git a/article/search_indexes.py b/article/search_indexes.py index c03a5cc99..48507e1d0 100644 --- a/article/search_indexes.py +++ b/article/search_indexes.py @@ -45,6 +45,25 @@ class ArticleIndex(indexes.SearchIndex, indexes.Indexable): ta_cluster = indexes.CharField(null=True) year_cluster = indexes.CharField(null=True) + # ISSNs — busca direta por ISSN + issn = indexes.MultiValueField(null=True) + + # Licença do artigo + license = indexes.CharField(null=True) + + # Afiliações (países e instituições para filtro geográfico/institucional) + aff_country = indexes.MultiValueField(null=True) + aff_institution = indexes.MultiValueField(null=True) + + # Status OA do periódico + open_access = indexes.CharField(null=True) + + # Bases de indexação do periódico + indexed_at = indexes.MultiValueField(null=True) + + # Crossmark ativo + crossmark_active = indexes.BooleanField(null=True) + def prepare(self, obj): """ " Here add the title to with dynamic fields. 
# Haystack ``prepare_<field>`` hooks added to ArticleIndex. Each one receives
# the Article being indexed as ``obj`` and returns the value for its field.

def prepare_issn(self, obj):
    """Return the journal's ISSNs for the multi-valued ``issn`` field.

    Collects the electronic ISSN, print ISSN and ISSN-L (in that order) from
    ``obj.journal.official``. Empty values are skipped and repeated values are
    emitted only once, since ISSN-L may coincide with one of the other two.

    Returns:
        list | None: ISSNs in first-seen order, or ``None`` when the article
        has no journal, the journal has no official record, or no ISSN is set.
    """
    if not (obj.journal and obj.journal.official):
        return None
    official = obj.journal.official
    candidates = (official.issn_electronic, official.issn_print, official.issnl)
    # dict.fromkeys keeps insertion order while dropping repeated values.
    unique_issns = list(dict.fromkeys(value for value in candidates if value))
    return unique_issns or None


def prepare_license(self, obj):
    """Return ``obj.license.license_type``, or ``None`` when either is unset."""
    if obj.license and obj.license.license_type:
        return obj.license.license_type
    return None


def prepare_open_access(self, obj):
    """Return the journal's ``open_access`` value, or ``None`` without a journal."""
    if obj.journal:
        return obj.journal.open_access
    return None


def prepare_indexed_at(self, obj):
    """Return the acronyms of the entries in ``obj.journal.indexed_at``.

    Entries with an empty acronym are ignored.

    Returns:
        list | None: acronyms, or ``None`` when there is no journal or no
        acronym to report (same empty-value convention as ``prepare_issn``).
    """
    if not obj.journal:
        return None
    acronyms = [
        entry.acronym for entry in obj.journal.indexed_at.all() if entry.acronym
    ]
    return acronyms or None


def prepare_crossmark_active(self, obj):
    """Return ``obj.journal.crossmark_doi_is_active``; ``False`` without a journal."""
    if obj.journal:
        return obj.journal.crossmark_doi_is_active
    return False
# Haystack ``prepare_<field>`` hooks added to ArticleOAIIndex. Each one
# receives the Article being indexed as ``obj``.

def prepare_publisher(self, obj):
    """Return ``obj.journal.publisher_names``, or ``None`` when absent or empty."""
    if not obj.journal:
        return None
    return obj.journal.publisher_names or None


def prepare_issn(self, obj):
    """Return the official journal ISSNs for the ``issn`` field.

    Reads ``issn_electronic``, ``issn_print`` and ``issnl`` from
    ``obj.journal.official`` (a missing attribute counts as empty), skipping
    empty values and emitting each distinct ISSN once, in that order.

    Returns:
        list | None: ISSNs, or ``None`` when there is no journal, no official
        record, or no ISSN set.
    """
    if not (obj.journal and obj.journal.official):
        return None
    official = obj.journal.official
    found = (
        getattr(official, attr, None)
        for attr in ("issn_electronic", "issn_print", "issnl")
    )
    # dict.fromkeys keeps insertion order while dropping repeated values.
    return list(dict.fromkeys(value for value in found if value)) or None


def prepare_orcid(self, obj):
    """Return the distinct ORCIDs of ``obj.contrib_persons``.

    The related set is traversed once; a separate ``exists()`` probe is not
    needed because an empty traversal already yields ``None``.

    Returns:
        list | None: ORCIDs in first-seen order, or ``None`` when no
        contributor has one.
    """
    orcids = dict.fromkeys(
        person.orcid for person in obj.contrib_persons.all() if person.orcid
    )
    return list(orcids) or None
prepare_la(self, obj): """The language of the article.""" @@ -426,9 +498,9 @@ def prepare_identifier(self, obj): for collection in collections: for lang in obj.languages.all(): idents.add( - "http://%s/scielo.php?script=sci_arttext&pid=%s&tlng=%s" + "%s/scielo.php?script=sci_arttext&pid=%s&tlng=%s" % ( - collection.domain, + collection.base_url, obj.pid_v2, lang.code2, ) From 9cfa0463487c1e92e5f663339308dfa1e5242fa2 Mon Sep 17 00:00:00 2001 From: Roberta Takenaka Date: Thu, 12 Mar 2026 12:27:50 -0300 Subject: [PATCH 14/27] =?UTF-8?q?article:=20adiciona=20ArticleIteratorBuil?= =?UTF-8?q?der=20ao=20controller=20para=20sele=C3=A7=C3=A3o=20unificada=20?= =?UTF-8?q?de=20artigos?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduz a classe ArticleIteratorBuilder que encadeia até quatro iteradores independentes de seleção de artigos: - _iter_from_pid_provider: itera PidProviderXML filtrados por ISSN, intervalo de ano de publicação, datas e proc_status_list; usa Journal.get_journal_issns para agrupar por periódico. - _iter_from_article: itera Article por data_status; tenta recuperar pp_xml via get_by_pid_v3 quando ausente; yields None para artigos sem pp_xml recuperável (sinaliza skip no dispatcher). - _iter_from_harvest: coleta documentos via OPACHarvester (Brasil/scl) ou AMHarvester (demais coleções); carrega coleções se banco vazio; yields dict com xml_url, collection_acron, pid e source_date. - _iter_from_article_source: itera ArticleSource via get_queryset_to_complete_data com filtros de data, force_update e article_source_status_list. Todos os iteradores são encadeados em __iter__ via yield from, permitindo múltiplas fontes ativas simultaneamente na mesma instância. Imports removidos: load_article, date_utils, SciELOJournal, XMLVersionXmlWithPreError, PPXML_STATUS_DUPLICATED/DEDUPLICATED, DATA_STATUS_DUPLICATED/DEDUPLICATED/PUBLIC, Q, datetime. 
class ArticleIteratorBuilder:
    """
    Build and chain the article-selection iterators that feed the pipeline.

    Each ``_iter_from_*`` method is a generator that yields keyword arguments
    for ``task_process_article_pipeline``. Which generators run is decided by
    the "exclusive" arguments set on the instance; several may be active at
    the same time, and they are chained in a fixed order.

    ============================ ==========================================
    Exclusive argument           Iterator activated
    ============================ ==========================================
    limit / timeout / opac_url   _iter_from_harvest
    article_source_status_list   _iter_from_article_source
    proc_status_list             _iter_from_pid_provider
    data_status_list             _iter_from_article
    (none of the above)          _iter_from_pid_provider (default)
    ============================ ==========================================

    An exclusive argument counts as "set" when it is truthy, so ``None``,
    an empty list, ``0`` and ``""`` all leave the matching iterator inactive.

    Items yielded are dicts, except that ``_iter_from_article`` yields
    ``None`` for an article whose ``pp_xml`` cannot be recovered, so the
    consumer can count it as skipped.

    Usage::

        it = ArticleIteratorBuilder(
            user=user,
            collection_acron_list=["scl"],
            proc_status_list=["todo"],
            data_status_list=["invalid"],
        )
        for kwargs in it:
            task_process_article_pipeline.delay(**kwargs)
    """

    def __init__(
        self,
        user,
        collection_acron_list=None,
        journal_acron_list=None,
        from_pub_year=None,
        until_pub_year=None,
        from_date=None,
        until_date=None,
        proc_status_list=None,
        data_status_list=None,
        article_source_status_list=None,
        limit=None,
        timeout=None,
        opac_url=None,
        force_update=None,
    ):
        """Store the selection filters; no query or request is made here."""
        self.user = user
        self.collection_acron_list = collection_acron_list
        self.journal_acron_list = journal_acron_list
        self.from_pub_year = from_pub_year
        self.until_pub_year = until_pub_year
        self.from_date = from_date
        self.until_date = until_date
        self.proc_status_list = proc_status_list
        self.data_status_list = data_status_list
        self.article_source_status_list = article_source_status_list
        self.limit = limit
        self.timeout = timeout
        self.opac_url = opac_url
        self.force_update = force_update

    def __iter__(self):
        """Chain every active selection iterator, in a fixed order."""
        for build_iterator in self._active_iterators():
            yield from build_iterator()

    def _active_iterators(self):
        """
        Return the ``_iter_from_*`` bound methods enabled by the arguments.

        The order is harvest, article source, pid provider, article. The pid
        provider iterator is also the fallback when nothing else is enabled.
        """
        selected = []
        if self.limit or self.timeout or self.opac_url:
            selected.append(self._iter_from_harvest)
        if self.article_source_status_list:
            selected.append(self._iter_from_article_source)
        article_requested = bool(self.data_status_list)
        if self.proc_status_list or not (selected or article_requested):
            selected.append(self._iter_from_pid_provider)
        if article_requested:
            selected.append(self._iter_from_article)
        return selected

    # ------------------------------------------------------------------
    # Selection iterators
    # ------------------------------------------------------------------

    def _iter_from_pid_provider(self):
        """
        Yield ``{"pp_xml_id": ...}`` for PidProviderXML records.

        Records are queried once per ISSN group returned by
        ``Journal.get_journal_issns``; when that call returns a falsy value a
        single query without ISSN filter is made. Status defaults to TODO and
        INVALID when ``proc_status_list`` is not set.
        """
        journal_issn_groups = (
            Journal.get_journal_issns(
                self.collection_acron_list, self.journal_acron_list
            )
            or [None]
        )
        status_list = self.proc_status_list or [
            PPXML_STATUS_TODO,
            PPXML_STATUS_INVALID,
        ]
        for journal_issns in journal_issn_groups:
            issn_list = [i for i in journal_issns if i] if journal_issns else None
            if journal_issns and not issn_list:
                # The group held only empty values: nothing to filter by.
                continue
            qs = PidProviderXML.get_queryset(
                issn_list=issn_list,
                from_pub_year=self.from_pub_year,
                until_pub_year=self.until_pub_year,
                from_updated_date=self.from_date,
                until_updated_date=self.until_date,
                proc_status_list=status_list,
            )
            for item in qs.iterator():
                yield {"pp_xml_id": item.id}

    def _iter_from_article(self):
        """
        Yield ``{"pp_xml_id": ...}`` for Articles filtered by ``data_status``.

        Status defaults to PENDING, UNDEF and INVALID. For an article without
        ``pp_xml`` a lookup by ``pid_v3`` is attempted and, when it succeeds,
        the link is saved. If the lookup raises, returns nothing, or the save
        fails, ``None`` is yielded instead so the consumer can skip the item.
        """
        filters = {
            "data_status__in": self.data_status_list
            or [
                choices.DATA_STATUS_PENDING,
                choices.DATA_STATUS_UNDEF,
                choices.DATA_STATUS_INVALID,
            ]
        }
        journal_id_list = Journal.get_ids(
            collection_acron_list=self.collection_acron_list,
            journal_acron_list=self.journal_acron_list,
        )
        if journal_id_list:
            filters["journal__in"] = journal_id_list
        if self.from_pub_year:
            filters["pub_year__gte"] = self.from_pub_year
        if self.until_pub_year:
            filters["pub_year__lte"] = self.until_pub_year
        if self.from_date:
            filters["updated__gte"] = self.from_date
        if self.until_date:
            filters["updated__lte"] = self.until_date

        for article in Article.objects.filter(**filters).iterator():
            if not article.pp_xml:
                recovered = None
                try:
                    recovered = PidProviderXML.get_by_pid_v3(pid_v3=article.pid_v3)
                    if recovered is not None:
                        article.pp_xml = recovered
                        article.save(update_fields=["pp_xml"])
                    else:
                        logging.error(f"pp_xml not found for article {article.id}")
                except Exception as e:
                    logging.error(f"pp_xml not found for article {article.id}: {e}")
                    recovered = None
                if recovered is None:
                    # Without this guard a None lookup result would reach
                    # ``article.pp_xml.id`` below and raise AttributeError.
                    yield None
                    continue
            yield {"pp_xml_id": article.pp_xml.id}

    def _iter_from_harvest(self):
        """
        Yield one dict per document returned by the collection harvesters.

        Collections are loaded first when the table is empty. Each item has
        the keys ``xml_url``, ``collection_acron``, ``pid`` and
        ``source_date`` (``processing_date``, falling back to
        ``origin_date``).
        """
        if not Collection.objects.exists():
            Collection.load(self.user)

        acronyms = self.collection_acron_list or list(Collection.get_acronyms())
        for collection_acron in acronyms:
            harvester = self._build_harvester(collection_acron)
            for document in harvester.harvest_documents():
                yield {
                    "xml_url": document["url"],
                    "collection_acron": collection_acron,
                    "pid": document["pid_v2"],
                    "source_date": document.get("processing_date")
                    or document.get("origin_date"),
                }

    def _iter_from_article_source(self):
        """
        Yield ``{"article_source_id": ...}`` for the ArticleSource records
        returned by ``ArticleSource.get_queryset_to_complete_data``.
        """
        for article_source in ArticleSource.get_queryset_to_complete_data(
            self.from_date,
            self.until_date,
            self.force_update,
            self.article_source_status_list,
        ):
            yield {"article_source_id": article_source.id}

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _build_harvester(self, collection_acron):
        """
        Instantiate the harvester for ``collection_acron``.

        ``"scl"`` uses ``OPACHarvester`` (with ``opac_url`` or
        ``"www.scielo.br"``); every other collection uses ``AMHarvester``.
        """
        kwargs = {
            "from_date": self.from_date,
            "until_date": self.until_date,
            "limit": self.limit,
            "timeout": self.timeout,
        }
        if collection_acron == "scl":
            return OPACHarvester(
                self.opac_url or "www.scielo.br", collection_acron, **kwargs
            )
        return AMHarvester("article", collection_acron, **kwargs)
task_load_journal_articles - task_fix_journal_articles_status (lógica absorvida por task_fix_article_status) Tasks novas: - task_dispatch_articles: orquestradora que instancia ArticleIteratorBuilder com todos os filtros disponíveis (collection, journal, ano, data, proc_status_list, data_status_list, article_source_status_list, limit, timeout, opac_url) e dispara task_process_article_pipeline.delay para cada item; contabiliza dispatched/skipped; registra UnexpectedEvent em caso de erro. - task_process_article_pipeline: pipeline com três pontos de entrada: Fluxo A (xml_url + collection_acron + pid → AMArticle → ArticleSource → add_pid_provider → pp_xml_id), Fluxo B (article_source_id → add_pid_provider → pp_xml_id), Fluxo C (pp_xml_id direto). Após obter pp_xml, chama load_article e atualiza pp_xml.collections. Se export_to_articlemeta=True, verifica disponibilidade e dispara task_export_article_to_articlemeta. task_fix_article_status: absorve a lógica de task_fix_journal_articles_status, iterando diretamente sobre journal_id_list (derivada de collection/journal ou de journal_id direto) sem subtarefas; aceita journal_id direto como atalho. task_check_article_availability: sem alteração funcional; docstring corrigida (removia docstring de outra task). Docstrings adicionadas/corrigidas em: load_funding_data, load_preprint, task_convert_xml_to_other_formats_for_articles, convert_xml_to_other_formats, transfer_license_statements_fk_to_article_license, get_researcher_identifier_unnormalized, normalize_stored_email, task_export_articles_to_articlemeta, task_export_article_to_articlemeta. Imports removidos: traceback, datetime/timedelta, group, transaction, Count/F/Prefetch/Q/Subquery, choices, fetch_data, AMHarvester, OPACHarvester, SciELOJournal, PPXML_STATUS_DONE/TODO/INVALID, PidProvider. 
--- article/tasks.py | 1519 +++++++++++++++++----------------------------- 1 file changed, 572 insertions(+), 947 deletions(-) diff --git a/article/tasks.py b/article/tasks.py index c074a6506..3a7550e70 100644 --- a/article/tasks.py +++ b/article/tasks.py @@ -1,29 +1,20 @@ import logging import sys -import traceback -from datetime import datetime, timedelta -from celery import group, shared_task from django.contrib.auth import get_user_model -from django.db import transaction -from django.db.models import Count, F, Prefetch, Q, Subquery from django.utils.translation import gettext_lazy as _ from article import controller from article.models import Article, ArticleFormat, ArticleSource, AMArticle from article.sources.preprint import harvest_preprints from article.sources.xmlsps import load_article -from article import choices from collection.models import Collection from config import celery_app from core.models import License from core.utils.extracts_normalized_email import extracts_normalized_email -from core.utils.utils import _get_user, fetch_data -from core.utils.harvesters import AMHarvester, OPACHarvester -from journal.models import SciELOJournal, Journal -from pid_provider.choices import PPXML_STATUS_DONE, PPXML_STATUS_TODO, PPXML_STATUS_INVALID +from core.utils.utils import _get_user +from journal.models import Journal from pid_provider.models import PidProviderXML -from pid_provider.provider import PidProvider from researcher.models import ResearcherIdentifier from tracker.models import UnexpectedEvent @@ -32,13 +23,70 @@ @celery_app.task() def load_funding_data(user, file_path): + """ + Carrega dados de financiamento a partir de um arquivo CSV ou similar. + + Processa um arquivo de dados de financiamento e carrega as informações + no banco de dados, associando-as aos artigos correspondentes. 
+ + Args: + user (int): ID do usuário que está executando a tarefa + file_path (str): Caminho absoluto para o arquivo contendo dados de financiamento + + Returns: + None + + Side Effects: + - Lê arquivo de financiamento do sistema de arquivos + - Cria/atualiza registros de financiamento no banco + - Registra logs de processamento e erros + + Notes: + - Utiliza controller.read_file para processamento + - O formato do arquivo deve seguir o padrão esperado pelo sistema + """ user = User.objects.get(pk=user) controller.read_file(user, file_path) -@celery_app.task(bind=True, name=_("load_preprints")) +@celery_app.task(bind=True, name=_('load_preprints')) def load_preprint(self, user_id, oai_pmh_preprint_uri): + """ + Coleta e carrega preprints de um endpoint OAI-PMH específico. + + Conecta-se a um servidor OAI-PMH para coletar metadados de preprints + e carregá-los no sistema para posterior processamento. + + Args: + self: Instância da tarefa Celery + user_id (int): ID do usuário executando a tarefa (obrigatório) + oai_pmh_preprint_uri (str): URI do endpoint OAI-PMH para coleta (obrigatório) + + Returns: + None + + Side Effects: + - Conecta ao endpoint OAI-PMH especificado + - Coleta metadados de preprints disponíveis + - Cria/atualiza registros de preprints no banco + - Registra logs de processamento e eventuais erros + + Todo: + - Implementar filtro para não coletar todos os registros sempre + - Adicionar suporte a coleta incremental por data + + Examples: + # Coletar preprints de repositório específico + load_preprint.delay( + user_id=1, + oai_pmh_preprint_uri="http://repo.example.com/oai/request" + ) + + Notes: + - Utiliza harvest_preprints para o processamento efetivo + - A coleta completa pode ser demorada em repositórios grandes + """ user = User.objects.get(pk=user_id) ## fazer filtro para não coletar tudo sempre harvest_preprints(oai_pmh_preprint_uri, user) @@ -48,6 +96,34 @@ def load_preprint(self, user_id, oai_pmh_preprint_uri): def 
task_convert_xml_to_other_formats_for_articles( self, user_id=None, username=None, from_date=None, force_update=False ): + """ + Dispara conversão de XML para outros formatos para todos os artigos com SPS package. + + Itera por todos os artigos que possuem sps_pkg_name e dispara + tarefas individuais de conversão para cada um. + + Args: + self: Instância da tarefa Celery + user_id (int, optional): ID do usuário executando a tarefa + username (str, optional): Nome do usuário executando a tarefa + from_date (str, optional): Data inicial para filtrar artigos (não implementado) + force_update (bool, optional): Força reprocessamento mesmo se já convertido + + Returns: + None + + Side Effects: + - Dispara múltiplas subtarefas convert_xml_to_other_formats + - Registra UnexpectedEvent em caso de erro + - Processa todos os artigos com sps_pkg_name + + Examples: + # Converter todos os artigos + task_convert_xml_to_other_formats_for_articles.delay( + user_id=1, + force_update=True + ) + """ try: user = _get_user(self.request, username, user_id) @@ -87,6 +163,31 @@ def task_convert_xml_to_other_formats_for_articles( def convert_xml_to_other_formats( self, user_id=None, username=None, item_id=None, force_update=None ): + """ + Converte XML de um artigo específico para outros formatos (HTML, PDF, etc.). + + Verifica se o artigo já possui formatos gerados e, caso necessário, + gera os formatos a partir do XML SPS armazenado. 
+ + Args: + self: Instância da tarefa Celery + user_id (int, optional): ID do usuário executando a tarefa + username (str, optional): Nome do usuário executando a tarefa + item_id (int): ID do artigo a ser processado (obrigatório) + force_update (bool, optional): Força regeneração mesmo se já existe + + Returns: + None + + Side Effects: + - Cria/atualiza registros ArticleFormat + - Gera arquivos HTML, PDF e outros formatos + - Registra logs de processamento + + Notes: + - Pula processamento se ArticleFormat já existe e force_update=False + - Utiliza ArticleFormat.generate_formats para conversão + """ user = _get_user(self.request, username, user_id) try: @@ -110,118 +211,34 @@ def convert_xml_to_other_formats( @celery_app.task(bind=True) -def task_select_articles_to_complete_data( - self, - username=None, - user_id=None, - collection_acron_list=None, - journal_acron_list=None, - data_status_list=None, - from_pub_year=None, - until_pub_year=None, - from_updated_date=None, - until_updated_date=None, - articlemeta_export_enable=False, +def transfer_license_statements_fk_to_article_license( + self, user_id=None, username=None ): """ - Task para carregar artigos de uma lista selecionada de periódicos. - Dispara subtasks para cada periódico encontrado. 
- """ - try: - user = _get_user(self.request, username=username, user_id=user_id) - - # Construir filtros para os artigos - article_filters = {} - - # Obter IDs dos periódicos baseado nos filtros - journal_id_list = Journal.get_ids( - collection_acron_list=collection_acron_list, - journal_acron_list=journal_acron_list, - ) - - if journal_id_list: - article_filters["journal__in"] = journal_id_list - - # Aplicar filtro de status se fornecido - if not data_status_list: - data_status_list = [ - choices.DATA_STATUS_PENDING, - choices.DATA_STATUS_UNDEF, - choices.DATA_STATUS_INVALID, - ] - article_filters["data_status__in"] = data_status_list - - # Adicionar filtros de data se fornecidos - if from_pub_year: - article_filters["pub_year__gte"] = from_pub_year - if until_pub_year: - article_filters["pub_year__lte"] = until_pub_year - if from_updated_date: - article_filters["updated__gte"] = from_updated_date - if until_updated_date: - article_filters["updated__lte"] = until_updated_date - - # Processar artigos - articles_processed = 0 - articles_skipped = 0 - - for article in Article.objects.filter(**article_filters).iterator(): - if not article.pp_xml_id: - try: - pp_xml = PidProviderXML.objects.get(v3=article.pid_v3) - article.pp_xml = pp_xml - article.save(update_fields=['pp_xml']) - except PidProviderXML.DoesNotExist: - articles_skipped += 1 - continue - - task_load_article_from_pp_xml.delay( - pp_xml_id=article.pp_xml_id, - pid_v3=article.pid_v3, - user_id=user_id or user.id, - username=username or user.username, - articlemeta_export_enable=articlemeta_export_enable, - ) - articles_processed += 1 - - return { - "status": "success", - "message": "Complete data to articles", - "articles_processed": articles_processed, - "articles_skipped": articles_skipped, - "filters": { - "collection_acron_list": collection_acron_list, - "journal_acron_list": journal_acron_list, - "from_pub_year": from_pub_year, - "until_pub_year": until_pub_year, - "from_updated_date": 
from_updated_date, - "until_updated_date": until_updated_date, - "data_status_list": data_status_list, - }, - } - except Exception as e: - exc_type, exc_value, exc_traceback = sys.exc_info() - UnexpectedEvent.create( - exception=e, - exc_traceback=exc_traceback, - detail={ - "task": "task_select_articles_to_complete_data", - "collection_acron_list": collection_acron_list, - "journal_acron_list": journal_acron_list, - "data_status_list": data_status_list, - "from_pub_year": from_pub_year, - "until_pub_year": until_pub_year, - "from_updated_date": from_updated_date, - "until_updated_date": until_updated_date, - }, - ) - raise + Migra informações de licença de license_statements para o campo license. + Processa artigos que não possuem license mas têm license_statements, + transferindo as informações para o campo direto license. -@celery_app.task(bind=True) -def transfer_license_statements_fk_to_article_license( - self, user_id=None, username=None -): + Args: + self: Instância da tarefa Celery + user_id (int, optional): ID do usuário executando a tarefa + username (str, optional): Nome do usuário executando a tarefa + + Returns: + None + + Side Effects: + - Atualiza campo license em artigos + - Cria registros License se necessário + - Executa bulk_update para otimizar performance + - Registra logs de processamento + + Notes: + - Processa apenas artigos com license=None + - Usa o primeiro license_statement como referência + - Cria License automaticamente se não existir + """ user = _get_user(self.request, username, user_id) articles_to_update = [] for instance in Article.objects.filter(license__isnull=True): @@ -247,6 +264,20 @@ def transfer_license_statements_fk_to_article_license( def get_researcher_identifier_unnormalized(): + """ + Retorna identificadores de e-mail que não seguem formato padrão RFC 5322. + + Filtra objetos ResearcherIdentifier que possuem source_name="EMAIL" + mas cujo campo identifier não corresponde ao padrão de e-mail válido. 
+ + Returns: + QuerySet: Queryset de ResearcherIdentifier com e-mails mal formatados + + Notes: + - Usa regex para identificar e-mails fora do padrão + - Utilizada pela tarefa normalize_stored_email para identificar registros a corrigir + - Regex verifica formato básico: usuario@dominio.extensao + """ return ResearcherIdentifier.objects.filter(source_name="EMAIL").exclude( identifier__regex=r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$" ) @@ -256,6 +287,38 @@ def get_researcher_identifier_unnormalized(): def normalize_stored_email( self, ): + """ + Normaliza e corrige endereços de e-mail mal formatados no banco de dados. + + Busca identificadores de pesquisadores do tipo EMAIL que não seguem + o padrão RFC 5322 e aplica normalização para corrigir formatos inválidos. + + Args: + self: Instância da tarefa Celery + + Returns: + None + + Side Effects: + - Identifica e-mails com formato inválido usando regex + - Aplica normalização através de extracts_normalized_email + - Executa bulk_update para otimizar performance em lotes + - Registra logs de processamento + + Examples: + # Executar normalização de e-mails + normalize_stored_email.delay() + + Notes: + - Processa apenas ResearcherIdentifier com source_name="EMAIL" + - Usa regex para identificar e-mails que não seguem formato padrão + - Operação é idempotente - pode ser executada múltiplas vezes + - Performance otimizada com bulk_update para grandes volumes + + See Also: + - get_researcher_identifier_unnormalized(): Função auxiliar para filtros + - extracts_normalized_email(): Função de normalização de e-mails + """ updated_list = [] re_identifiers = get_researcher_identifier_unnormalized() @@ -284,20 +347,51 @@ def task_export_articles_to_articlemeta( username=None, ): """ - Export articles to ArticleMeta Database with flexible filtering. - + Exporta artigos em lote para a base de dados ArticleMeta com filtros flexíveis. 
+ + Processa e exporta múltiplos artigos para o sistema ArticleMeta baseado + em critérios de filtragem por coleção, periódico, ano ou data. + Args: - collection_acron_list: List of collection acronyms - journal_acron_list: List of journal acronyms - year_of_publication: Specific year of publication - from_pub_year: Start publication year - until_pub_year: End publication year - from_date: Start date for filtering - until_date: End date for filtering - days_to_go_back: Number of days to go back - force_update: Force update existing records - user_id: User ID for authentication - username: Username for authentication + self: Instância da tarefa Celery + collection_acron_list (list, optional): Lista de acrônimos de coleções + journal_acron_list (list, optional): Lista de acrônimos de periódicos + year_of_publication (int, optional): Ano específico de publicação + from_pub_year (int, optional): Ano inicial para filtro de publicação + until_pub_year (int, optional): Ano final para filtro de publicação + from_date (str, optional): Data inicial para filtro (formato ISO) + until_date (str, optional): Data final para filtro (formato ISO) + days_to_go_back (int, optional): Número de dias para retroceder da data atual + force_update (bool, optional): Força reprocessamento mesmo se já exportado + user_id (int, optional): ID do usuário executando a tarefa + username (str, optional): Nome do usuário executando a tarefa + + Returns: + dict: Resultado da operação com estatísticas de processamento + + Side Effects: + - Exporta múltiplos artigos para ArticleMeta + - Atualiza status de exportação dos artigos + - Registra logs de processamento + - Registra UnexpectedEvent em caso de erro + + Examples: + # Exportar por coleção e período + task_export_articles_to_articlemeta.delay( + collection_acron_list=["scl", "arg"], + from_pub_year=2023, + until_pub_year=2024 + ) + + # Exportar artigos dos últimos 7 dias + task_export_articles_to_articlemeta.delay( + days_to_go_back=7, + 
force_update=True + ) + + Notes: + - Utiliza controller.bulk_export_articles_to_articlemeta internamente + - Pode processar grandes volumes de dados """ try: user = _get_user(self.request, username=username, user_id=user_id) @@ -353,16 +447,42 @@ def task_export_article_to_articlemeta( username=None, ): """ - Export a single article to ArticleMeta Database. + Exporta um artigo específico para a base de dados ArticleMeta. + + Processa e exporta um único artigo identificado pelo PID v3 + para o sistema ArticleMeta, com controle de atualizações forçadas. Args: - pid_v3: Article PID v3 - force_update: Force update existing records - user_id: User ID - username: Username + self: Instância da tarefa Celery + pid_v3 (str, optional): PID v3 do artigo a exportar (obrigatório) + collection_acron_list (list, optional): Lista de acrônimos de coleções para filtro + force_update (bool): Força reexportação mesmo se já exportado + user_id (int, optional): ID do usuário executando a tarefa + username (str, optional): Nome do usuário executando a tarefa Returns: - bool: True if export was successful, False otherwise. 
+ bool: True se exportação foi bem-sucedida, False caso contrário + + Side Effects: + - Exporta artigo específico para ArticleMeta + - Atualiza status de exportação do artigo + - Registra logs de processamento + - Registra UnexpectedEvent em caso de erro + + Raises: + ValueError: Se pid_v3 não for fornecido + Article.DoesNotExist: Se artigo com o PID não for encontrado + + Examples: + # Exportar artigo específico + task_export_article_to_articlemeta.delay( + pid_v3="S1234-56782024000100001", + force_update=True + ) + + Notes: + - Utiliza controller.export_article_to_articlemeta internamente + - Requer que o artigo exista na base local antes da exportação """ try: if not pid_v3: @@ -393,185 +513,106 @@ def task_export_article_to_articlemeta( @celery_app.task(bind=True) -def task_load_article_from_pp_xml( +def task_fix_article_status( self, - pp_xml_id=None, - pid_v3=None, - user_id=None, username=None, + user_id=None, collection_acron_list=None, - articlemeta_export_enable=None, - force_update=None, - timeout=None, - is_activate=None, - version=None, + journal_acron_list=None, + journal_id=None, + mark_as_invalid=False, + mark_as_public=False, + mark_as_duplicated=False, + deduplicate=False, ): """ - Carrega um artigo específico a partir de um PidProviderXML. + Marca artigos com diferentes status baseado em filtros de coleções e periódicos. - Processa o XML armazenado no PidProviderXML, cria/atualiza o Article - e opcionalmente exporta para ArticleMeta. + Aceita filtros por lista de coleções/periódicos ou um journal_id direto. + Itera pelos periódicos correspondentes e aplica as operações de marcação. 
Args: self: Instância da tarefa Celery - pp_xml_id (int): ID do PidProviderXML a processar (obrigatório) - user_id (int, optional): ID do usuário executando a tarefa username (str, optional): Nome do usuário executando a tarefa - articlemeta_export_enable (bool, optional): Exporta para ArticleMeta após carregar + user_id (int, optional): ID do usuário executando a tarefa + collection_acron_list (list, optional): Lista de acrônimos de coleções + journal_acron_list (list, optional): Lista de acrônimos de periódicos + journal_id (int, optional): ID direto de um periódico específico + mark_as_invalid (bool): Se True, marca artigos como invalid + mark_as_public (bool): Se True, marca artigos como public + mark_as_duplicated (bool): Se True, marca artigos como duplicated + deduplicate (bool): Se True, marca artigos como deduplicated Returns: - None + dict: Resumo da operação com contadores Side Effects: - - Cria/atualiza Article no banco - - Atualiza status do PidProviderXML para DONE - - Verifica disponibilidade do artigo - - Exporta para ArticleMeta se solicitado + - Altera status de artigos no banco - Registra UnexpectedEvent em caso de erro - Notes: - - O XML é lido diretamente do arquivo armazenado no PidProviderXML - - A verificação de disponibilidade valida URLs e assets do artigo + Examples: + task_fix_article_status.delay( + collection_acron_list=["scl"], + mark_as_invalid=True, + mark_as_public=True, + ) + + task_fix_article_status.delay( + journal_id=42, + deduplicate=True, + ) """ try: - user = _get_user(self.request, username, user_id) + user = _get_user(self.request, username=username, user_id=user_id) - pp_xml = None - # Busca o PidProviderXML com suas relações - if pp_xml_id: - pp_xml = PidProviderXML.objects.select_related("current_version").get( - id=pp_xml_id - ) + operations = { + "invalid": mark_as_invalid, + "public": mark_as_public, + "duplicated": mark_as_duplicated, + "deduplicated": deduplicate, + } - # Carrega o artigo do arquivo XML - article 
= load_article( - user, - v3=pid_v3, - pp_xml=pp_xml, - ) - pp_xml.collections.set(article.collections) + if not any(operations.values()): + raise ValueError("At least one marking operation must be specified") - # Exporta para ArticleMeta se solicitado - if articlemeta_export_enable: - # Verifica disponibilidade (URLs, assets, etc) - article.check_availability(user) - controller.export_article_to_articlemeta( - user, - article, - collection_acron_list, - force_update, - version=version, - ) + # Determinar lista de journal_ids a processar + if journal_id: + journal_id_list = [journal_id] else: - if article.is_available(): - return - task_check_article_availability.delay( - article_id=article.id, - user_id=user.id, - username=user.username, - collection_acron_list=collection_acron_list, - timeout=timeout, - is_activate=is_activate, - force_update=force_update, - ) + journal_id_list = Journal.get_ids(collection_acron_list, journal_acron_list) - except Exception as exception: - exc_type, exc_value, exc_traceback = sys.exc_info() - UnexpectedEvent.create( - exception=exception, - exc_traceback=exc_traceback, - detail={ - "task": "article.tasks.task_load_article_from_pp_xml", - "pp_xml_id": pp_xml_id, - "articlemeta_export_enable": articlemeta_export_enable, - "force_update": force_update, - }, - ) + journals_processed = 0 + for jid in journal_id_list: + if Article.objects.filter(journal_id=jid).count() == 0: + continue -@celery_app.task(bind=True) -def task_select_articles_to_load_from_api( - self, - username=None, - user_id=None, - collection_acron_list=None, - from_date=None, - until_date=None, - limit=None, - timeout=None, - force_update=None, - auto_solve_pid_conflict=None, - opac_url=None, -): - """ - Tarefa orquestradora para carregar artigos de múltiplas coleções via API. - - Dispara tarefas paralelas para cada coleção, otimizando o processamento - em larga escala. Se nenhuma coleção for especificada, processa todas as - coleções conhecidas do SciELO. 
- - Args: - self: Instância da tarefa Celery - username (str, optional): Nome do usuário executando a tarefa - user_id (int, optional): ID do usuário executando a tarefa - collection_acron_list (list, optional): Lista de acrônimos das coleções. - Se None, usa lista padrão com todas as coleções SciELO. - Ex: ["scl", "arg", "mex", "esp"] - from_date (str, optional): Data inicial para coleta (formato ISO) - until_date (str, optional): Data final para coleta (formato ISO) - limit (int, optional): Limite de artigos por coleção - timeout (int, optional): Timeout em segundos para requisições HTTP - force_update (bool, optional): Força atualização mesmo se já existe - auto_solve_pid_conflict (bool, optional): Resolve conflitos de PID automaticamente - - Returns: - None + if mark_as_invalid: + Article.mark_items_as_invalid(journal_id=jid) - Side Effects: - - Garante que coleções estão carregadas no banco - - Dispara uma tarefa para cada coleção em collection_acron_list - - Registra UnexpectedEvent em caso de erro + if mark_as_public: + Article.mark_items_as_public(journal_id=jid) - Examples: - # Carregar artigos de coleções específicas - task_select_articles_to_load_from_api.delay( - collection_acron_list=["scl", "mex"], - from_date="2024-01-01", - until_date="2024-12-31" - ) + if mark_as_duplicated or deduplicate: + Article.deduplicate_items( + user, + journal_id=jid, + mark_as_duplicated=mark_as_duplicated, + deduplicate=deduplicate, + ) - # Carregar artigos de todas as coleções com limite - task_select_articles_to_load_from_api.delay( - limit=100, - force_update=True - ) - """ - try: - user = _get_user(self.request, username=username, user_id=user_id) + journals_processed += 1 - # Define coleções padrão se não especificadas - # Garante que as coleções estão carregadas no banco - if Collection.objects.count() == 0: - Collection.load(user) - - if not collection_acron_list: - collection_acron_list = Collection.get_acronyms() - # Dispara tarefa para cada coleção - for 
collection_acron in collection_acron_list: - task_select_articles_to_load_from_collection_endpoint.apply_async( - kwargs={ - "username": username, - "user_id": user_id, - "collection_acron": collection_acron, - "from_date": from_date, - "until_date": until_date, - "limit": limit, - "timeout": timeout, - "force_update": force_update, - "auto_solve_pid_conflict": auto_solve_pid_conflict, - "opac_url": opac_url, - } - ) + return { + "status": "success", + "journals_processed": journals_processed, + "operations": {k: v for k, v in operations.items() if v}, + "filters": { + "collections": collection_acron_list, + "journals": journal_acron_list, + "journal_id": journal_id, + }, + } except Exception as e: exc_type, exc_value, exc_traceback = sys.exc_info() @@ -579,394 +620,193 @@ def task_select_articles_to_load_from_api( exception=e, exc_traceback=exc_traceback, detail={ - "task": "task_select_articles_to_load_from_api", + "task": "task_fix_article_status", "collection_acron_list": collection_acron_list, - "from_date": from_date, - "until_date": until_date, - "limit": limit, - "timeout": timeout, + "journal_acron_list": journal_acron_list, + "journal_id": journal_id, + "operations": { + "mark_as_invalid": mark_as_invalid, + "mark_as_public": mark_as_public, + "mark_as_duplicated": mark_as_duplicated, + "deduplicate": deduplicate, + }, }, ) + raise @celery_app.task(bind=True) -def task_select_articles_to_load_from_collection_endpoint( +def task_check_article_availability( self, - username=None, user_id=None, - collection_acron=None, - from_date=None, - until_date=None, - limit=None, + username=None, + article_id=None, + collection_acron_list=None, timeout=None, - force_update=None, - auto_solve_pid_conflict=None, - opac_url=None, + is_activate=None, + force_update=False, ): """ - Coleta artigos de uma coleção específica via endpoint OPAC ou ArticleMeta. + Verifica e atualiza o status de disponibilidade de um artigo específico. 
- Utiliza harvesters especializados para cada tipo de endpoint: - - OPACHarvester: Para coleção Brasil (scl) - - AMHarvester: Para demais coleções via ArticleMeta + Executa verificações de URLs, assets e outros recursos do artigo + para determinar se está completamente disponível online. Args: self: Instância da tarefa Celery - username (str, optional): Nome do usuário executando a tarefa user_id (int, optional): ID do usuário executando a tarefa - collection_acron (str): Acrônimo da coleção (obrigatório). - Ex: "scl", "mex", "arg" - from_date (str, optional): Data inicial para coleta (formato ISO) - until_date (str, optional): Data final para coleta (formato ISO) - limit (int, optional): Limite de documentos a coletar - timeout (int, optional): Timeout em segundos para requisições - force_update (bool, optional): Força atualização de artigos existentes - auto_solve_pid_conflict (bool, optional): Resolve conflitos de PID - - Returns: - None - - Raises: - ValueError: Se collection_acron não for fornecido - - Side Effects: - - Dispara task_load_article_from_xml_url para cada documento - - Registra UnexpectedEvent em caso de erro - - Notes: - - OPAC é usado apenas para Brasil (scl) por questões de performance - - ArticleMeta é usado para todas as outras coleções - """ - try: - if not collection_acron: - raise ValueError("Missing collection_acron") - - # Seleciona o harvester apropriado baseado na coleção - if collection_acron == "scl": - harvester = OPACHarvester( - opac_url or "www.scielo.br", - collection_acron, - from_date=from_date, - until_date=until_date, - limit=limit, - timeout=timeout, - ) - else: - harvester = AMHarvester( - "article", - collection_acron, - from_date=from_date, - until_date=until_date, - limit=limit, - timeout=timeout, - ) - - # Itera sobre documentos e dispara tarefas individuais - for document in harvester.harvest_documents(): - source_date = document.get("processing_date") or document.get("origin_date") - 
task_load_article_from_xml_url.delay( - username, - user_id, - collection_acron, - document["pid_v2"], - document["url"], - source_date, - force_update, - auto_solve_pid_conflict, - ) - - except Exception as e: - exc_type, exc_value, exc_traceback = sys.exc_info() - UnexpectedEvent.create( - exception=e, - exc_traceback=exc_traceback, - detail={ - "task": "task_select_articles_to_load_from_collection_endpoint", - "collection_acron": collection_acron, - "from_date": from_date, - "until_date": until_date, - "limit": limit, - "timeout": timeout, - "force_update": force_update, - }, - ) - - -@celery_app.task(bind=True) -def task_load_article_from_xml_url( - self, - username=None, - user_id=None, - collection_acron=None, - pid=None, - xml_url=None, - source_date=None, - force_update=None, - auto_solve_pid_conflict=None, -): - """ - Carrega um artigo individual a partir de uma URL de XML. - - Cria ou atualiza um ArticleSource e processa o XML para criar/atualizar - o artigo no banco de dados. - - Args: - self: Instância da tarefa Celery username (str, optional): Nome do usuário executando a tarefa - user_id (int, optional): ID do usuário executando a tarefa - xml_url (str): URL do XML do artigo - Ex: "https://www.scielo.br/scielo.php?script=sci_arttext&pid=..." 
- source_date (str, optional): Data de última atualização na fonte - force_update (bool, optional): Força reprocessamento mesmo se já completado - auto_solve_pid_conflict (bool, optional): Resolve conflitos de PID automaticamente + article_id (int, optional): ID do artigo a verificar (obrigatório) + collection_acron_list (list, optional): Lista de acrônimos de coleções para filtro + timeout (int, optional): Timeout em segundos para verificações HTTP + is_activate (bool, optional): Se deve ativar artigo após verificação + force_update (bool): Força nova verificação mesmo se recente Returns: None Side Effects: - - Cria/atualiza registro ArticleSource - - Processa XML e cria/atualiza Article + - Atualiza status de disponibilidade do artigo + - Verifica URLs de assets (PDF, HTML, etc.) + - Registra timestamps de última verificação - Registra UnexpectedEvent em caso de erro Notes: - - Pula processamento se ArticleSource já está COMPLETED e force_update=False - - XML é baixado e armazenado localmente antes do processamento + - Utiliza Article.check_availability para executar verificações + - Pode ser utilizada para monitoramento de saúde dos artigos """ try: - user = _get_user(self.request, username=username, user_id=user_id) - - # Cria ou atualiza ArticleSource - am_article = AMArticle.create_or_update( - pid, Collection.get(collection_acron), None, user - ) - - article_source = ArticleSource.create_or_update( - user=user, - url=xml_url, - source_date=source_date, - force_update=force_update, - am_article=am_article, - ) - article_source.complete_data( - user=user, - force_update=force_update, - auto_solve_pid_conflict=auto_solve_pid_conflict, - ) - - if article_source.status != ArticleSource.StatusChoices.COMPLETED: - return - - # Processa o XML - task_load_article_from_pp_xml.delay( - pp_xml_id=article_source.pid_provider_xml.id, - user_id=user_id or user.id, - username=username or user.username, - force_update=force_update, - ) - - except Exception as e: + user = 
_get_user(self.request, username, user_id) + article = Article.objects.get(id=article_id) + article.check_availability(user) + except Exception as exception: + logging.exception(f"Error processing article ID {article_id}: {str(exception)}") exc_type, exc_value, exc_traceback = sys.exc_info() UnexpectedEvent.create( - exception=e, + exception=exception, exc_traceback=exc_traceback, detail={ - "task": "task_load_article_from_xml_url", - "xml_url": xml_url, - "source_date": source_date, + "task": "article.tasks.task_check_article_availability", + "article_id": article_id, + "collection_acron_list": collection_acron_list, + "timeout": timeout, + "is_activate": is_activate, "force_update": force_update, }, ) @celery_app.task(bind=True) -def task_select_articles_to_load_from_article_source( +def task_dispatch_articles( self, username=None, user_id=None, + # --- filtros comuns --- + collection_acron_list=None, + journal_acron_list=None, + from_pub_year=None, + until_pub_year=None, from_date=None, until_date=None, force_update=None, - status_list=None, + export_to_articlemeta=False, auto_solve_pid_conflict=None, + # --- ativa pid_provider --- + proc_status_list=None, + # --- ativa article --- + data_status_list=None, + # --- ativa harvest (qualquer um) --- + limit=None, + timeout=None, + opac_url=None, + # --- ativa article_source --- + article_source_status_list=None, ): """ - Processa ArticleSources pendentes ou que necessitam reprocessamento. - - Busca ArticleSources com status pendente ou erro e processa seus XMLs. - Útil para reprocessar falhas anteriores ou completar processamentos interrompidos. 
- - Args: - self: Instância da tarefa Celery - username (str, optional): Nome do usuário executando a tarefa - user_id (int, optional): ID do usuário executando a tarefa - from_date (str, optional): Data inicial para filtrar ArticleSources - until_date (str, optional): Data final para filtrar ArticleSources - force_update (bool, optional): Força reprocessamento de todos - auto_solve_pid_conflict (bool, optional): Resolve conflitos de PID - - Returns: - None - - Side Effects: - - Processa XMLs de ArticleSources selecionados - - Atualiza status dos ArticleSources - - Registra UnexpectedEvent em caso de erro - - Examples: - # Reprocessar falhas dos últimos 7 dias - task_select_articles_to_load_from_article_source.delay( - from_date=(datetime.now() - timedelta(days=7)).isoformat(), - force_update=True - ) - """ - try: - user = _get_user(self.request, username=username, user_id=user_id) - - # Obtém queryset de ArticleSources para processar - for article_source in ArticleSource.get_queryset_to_complete_data( - from_date, - until_date, - force_update, - status_list, - ): - - try: - # Processa o XML - article_source.complete_data( - user=user, - force_update=force_update, - auto_solve_pid_conflict=auto_solve_pid_conflict, - ) - if article_source.status != ArticleSource.StatusChoices.COMPLETED: - continue - - task_load_article_from_pp_xml.delay( - pp_xml_id=article_source.pid_provider_xml.id, - user_id=user_id or user.id, - username=username or user.username, - force_update=force_update, - ) - except Exception as exception: - exc_type, exc_value, exc_traceback = sys.exc_info() - UnexpectedEvent.create( - exception=exception, - exc_traceback=exc_traceback, - detail={ - "task": "article.tasks.task_select_articles_to_load_from_article_source", - "article_source_id": str(article_source.id), - }, - ) - - except Exception as e: - exc_type, exc_value, exc_traceback = sys.exc_info() - UnexpectedEvent.create( - exception=e, - exc_traceback=exc_traceback, - detail={ - "task": 
"task_select_articles_to_load_from_article_source", - "from_date": from_date, - "until_date": until_date, - "force_update": force_update, - }, - ) - - -@celery_app.task(bind=True) -def task_fix_article_status( - self, - username=None, - user_id=None, - collection_acron_list=None, - journal_acron_list=None, - mark_as_invalid=False, - mark_as_public=False, - mark_as_duplicated=False, - deduplicate=False, -): - """ - Marca artigos com diferentes status baseado em filtros de coleções e periódicos. + Tarefa orquestradora que dispara processamento em lote de artigos. - Processa artigos aplicando diferentes marcações de status conforme parâmetros. - Itera diretamente pelos periódicos, usando coleção apenas como filtro. + Utiliza ArticleIteratorBuilder para selecionar artigos baseado em + múltiplos critérios e dispara task_process_article_pipeline para + cada item encontrado, permitindo processamento paralelo. Args: self: Instância da tarefa Celery username (str, optional): Nome do usuário executando a tarefa user_id (int, optional): ID do usuário executando a tarefa - collection_acron_list (list, optional): Lista de acrônimos de coleções para filtrar - journal_acron_list (list, optional): Lista de acrônimos de periódicos - mark_as_invalid (bool): Se True, marca artigos como invalid - mark_as_public (bool): Se True, marca artigos como public - mark_as_duplicated (bool): Se True, marca artigos como duplicated - deduplicate (bool): Se True, marca artigos como deduplicated + collection_acron_list (list, optional): Filtro por acrônimos de coleções + journal_acron_list (list, optional): Filtro por acrônimos de periódicos + from_pub_year (int, optional): Ano inicial de publicação + until_pub_year (int, optional): Ano final de publicação + from_date (str, optional): Data inicial (formato ISO) + until_date (str, optional): Data final (formato ISO) + force_update (bool, optional): Força reprocessamento + export_to_articlemeta (bool): Exporta para ArticleMeta após processamento + 
auto_solve_pid_conflict (bool, optional): Resolve conflitos de PID automaticamente + proc_status_list (list, optional): Status do pid_provider para filtro + data_status_list (list, optional): Status do article para filtro + limit (int, optional): Limite máximo de artigos a processar + timeout (int, optional): Timeout para operações HTTP + opac_url (str, optional): URL base do OPAC para harvest + article_source_status_list (list, optional): Status do article_source para filtro Returns: - dict: Resumo da operação com contadores - - Side Effects: - - Altera status de artigos no banco - - Registra UnexpectedEvent em caso de erro - - Dispara subtarefas para cada periódico + dict: Resumo com contadores de dispatched/skipped Examples: - # Marcar artigos como invalid para coleções específicas - task_fix_article_records_status.delay( - collection_acron_list=["scl", "mex"], - journal_acron_list=["abc", "xyz"], - mark_as_invalid=True + # Processamento padrão por coleção + task_dispatch_articles.delay(collection_acron_list=["scl"]) + + # Múltiplas fontes simultaneamente + task_dispatch_articles.delay( + proc_status_list=["todo"], + data_status_list=["invalid"], + article_source_status_list=["error"], + limit=500 ) - # Marcar artigos como public e deduplicated - task_fix_article_records_status.delay( - journal_acron_list=["abc"], - mark_as_public=True, - deduplicate=True - ) + Notes: + - Ver ArticleIteratorBuilder para detalhes sobre iteradores ativados + - Cada artigo encontrado gera uma subtarefa independente """ try: user = _get_user(self.request, username=username, user_id=user_id) - # Validação: ao menos uma operação deve ser especificada - operations = { - "invalid": mark_as_invalid, - "public": mark_as_public, - "duplicated": mark_as_duplicated, - "deduplicated": deduplicate, + common_kwargs = { + "user_id": user.id, + "username": user.username, + "force_update": force_update, + "export_to_articlemeta": export_to_articlemeta, + "auto_solve_pid_conflict": 
auto_solve_pid_conflict, } - if not any(operations.values()): - raise ValueError("At least one marking operation must be specified") - - # Construir filtros para os periódicos - journal_id_list = Journal.get_ids(collection_acron_list, journal_acron_list) + dispatched = skipped = 0 - # Iterar pelos periódicos e disparar subtarefas - journals_processed = 0 - for journal_id in journal_id_list: - qs = Article.objects.filter(journal_id=journal_id) - if qs.count() == 0: + for item_kwargs in controller.ArticleIteratorBuilder( + user=user, + collection_acron_list=collection_acron_list, + journal_acron_list=journal_acron_list, + from_pub_year=from_pub_year, + until_pub_year=until_pub_year, + from_date=from_date, + until_date=until_date, + proc_status_list=proc_status_list, + data_status_list=data_status_list, + article_source_status_list=article_source_status_list, + limit=limit, + timeout=timeout, + opac_url=opac_url, + force_update=force_update, + ): + if item_kwargs is None: + skipped += 1 continue - task_fix_journal_articles_status.apply_async( - kwargs={ - "username": username, - "user_id": user_id, - "journal_id": journal_id, - "mark_as_invalid": mark_as_invalid, - "mark_as_public": mark_as_public, - "mark_as_duplicated": mark_as_duplicated, - "deduplicate": deduplicate, - } - ) - journals_processed += 1 + task_process_article_pipeline.delay(**item_kwargs, **common_kwargs) + dispatched += 1 return { "status": "success", - "journals_processed": journals_processed, - "operations": {k: v for k, v in operations.items() if v}, - "filters": { - "collections": collection_acron_list, - "journals": journal_acron_list, - }, + "dispatched": dispatched, + "skipped": skipped, } except Exception as e: @@ -975,386 +815,171 @@ def task_fix_article_status( exception=e, exc_traceback=exc_traceback, detail={ - "task": "task_fix_article_records_status", + "task": "task_dispatch_articles", "collection_acron_list": collection_acron_list, "journal_acron_list": journal_acron_list, - 
"operations": { - "mark_as_invalid": mark_as_invalid, - "mark_as_public": mark_as_public, - "mark_as_duplicated": mark_as_duplicated, - "deduplicate": deduplicate, - }, + "from_pub_year": from_pub_year, + "until_pub_year": until_pub_year, + "from_date": from_date, + "until_date": until_date, + "proc_status_list": proc_status_list, + "data_status_list": data_status_list, + "article_source_status_list": article_source_status_list, + "force_update": force_update, + "export_to_articlemeta": export_to_articlemeta, }, ) raise - @celery_app.task(bind=True) -def task_fix_journal_articles_status( +def task_process_article_pipeline( self, - username=None, - user_id=None, - journal_id=None, + # Entrada para fluxo A (XML URL → ArticleSource → PidProviderXML) + xml_url=None, collection_acron=None, - journal_acron=None, - mark_as_invalid=False, - mark_as_public=False, - mark_as_duplicated=False, - deduplicate=False, + pid=None, + source_date=None, + # Entrada para fluxo B (ArticleSource existente → PidProviderXML) + article_source_id=None, + # Entrada direta para etapa C (PidProviderXML → Article) + pp_xml_id=None, + # Controle do fluxo + export_to_articlemeta=False, + collection_acron_list=None, + force_update=None, + auto_solve_pid_conflict=None, + version=None, + user_id=None, + username=None, ): """ - Marca artigos com diferentes status para um periódico específico. + Pipeline principal de processamento de artigos com múltiplos pontos de entrada. - Processa artigos do periódico aplicando as marcações de status especificadas. - Cada operação de marcação é executada independentemente se habilitada. 
+ Implementa um pipeline flexível que pode iniciar em diferentes estágios: + - Fluxo A: XML URL → ArticleSource → PidProviderXML → Article + - Fluxo B: ArticleSource existente → PidProviderXML → Article + - Fluxo C: PidProviderXML → Article (entrada direta) Args: self: Instância da tarefa Celery - username (str, optional): Nome do usuário executando a tarefa + xml_url (str, optional): URL do XML para fluxo A (requer collection_acron e pid) + collection_acron (str, optional): Acrônimo da coleção (obrigatório com xml_url) + pid (str, optional): PID do artigo (obrigatório com xml_url) + source_date (datetime, optional): Data da fonte para fluxo A + article_source_id (int, optional): ID do ArticleSource para fluxo B + pp_xml_id (int, optional): ID do PidProviderXML para fluxo C + export_to_articlemeta (bool): Se True, exporta para ArticleMeta após processamento + collection_acron_list (list, optional): Lista de coleções para exportação + force_update (bool, optional): Força reprocessamento mesmo se existir + auto_solve_pid_conflict (bool, optional): Resolve conflitos de PID automaticamente + version (str, optional): Versão específica a processar user_id (int, optional): ID do usuário executando a tarefa - journal_id (int, optional): ID do periódico (preferencial por performance) - journal_acron (str, optional): Acrônimo do periódico (alternativa ao journal_id) - mark_as_invalid (bool): Se True, marca artigos sem registro ativo como invalid - mark_as_public (bool): Se True, marca artigos como public - mark_as_duplicated (bool): Se True, marca artigos como duplicated - deduplicate (bool): Se True, marca artigos como deduplicated + username (str, optional): Nome do usuário executando a tarefa Returns: - dict: Resumo das operações realizadas - - Raises: - ValueError: Se nem journal_id nem journal_acron forem fornecidos + None Side Effects: - - Altera status de artigos no banco + - Cria/atualiza ArticleSource (fluxo A) + - Cria/atualiza PidProviderXML + - Cria/atualiza 
Article + - Verifica disponibilidade do artigo + - Exporta para ArticleMeta se solicitado - Registra UnexpectedEvent em caso de erro - - Pode executar múltiplas operações de marcação em sequência - - """ - try: - # Validar que ao menos um identificador foi fornecido - if not journal_id and not journal_acron: - raise ValueError("Either journal_id or journal_acron must be provided") - - user = _get_user(self.request, username=username, user_id=user_id) - - # Buscar o periódico por ID ou acrônimo - if journal_acron and collection_acron: - journal_ids = Journal.get_ids( - [collection_acron], - [journal_acron], - ) - elif journal_id: - journal_ids = [journal_id] - else: - raise ValueError("Insufficient data to identify the journal") - - if Article.objects.filter(journal__id__in=journal_ids).count() == 0: - return { - "status": "no_articles", - "journal_id": journal_id, - "journal_acron": journal_acron, - "collection_acron": collection_acron, - } - journal_id = journal_ids[0] - if mark_as_invalid: - Article.mark_items_as_invalid(journal_id=journal_id) - - if mark_as_public: - Article.mark_items_as_public(journal_id=journal_id) - if mark_as_duplicated or deduplicate: - Article.deduplicate_items(user, journal_id=journal_id, mark_as_duplicated=mark_as_duplicated, deduplicate=deduplicate) - - return { - "status": "success", - "journal_id": journal_id, - "journal_acron": journal_acron, - "collection_acron": collection_acron, - "operations_performed": { - "mark_as_invalid": mark_as_invalid, - "mark_as_public": mark_as_public, - "mark_as_duplicated": mark_as_duplicated, - "deduplicate": deduplicate, - }, - } + Raises: + ValueError: Se nenhum ponto de entrada válido for fornecido + Se xml_url fornecido sem collection_acron ou pid - except Exception as e: - exc_type, exc_value, exc_traceback = sys.exc_info() - UnexpectedEvent.create( - exception=e, - exc_traceback=exc_traceback, - detail={ - "task": "task_fix_journal_articles_status", - "journal_id": journal_id, - 
"journal_acron": journal_acron, - "collection_acron": collection_acron, - "operations": { - "mark_as_invalid": mark_as_invalid, - "mark_as_public": mark_as_public, - "mark_as_duplicated": mark_as_duplicated, - "deduplicate": deduplicate, - }, - }, + Examples: + # Fluxo completo a partir de URL + task_process_article_pipeline.delay( + xml_url="http://example.com/article.xml", + collection_acron="scl", + pid="S1234-56782024000100001", + export_to_articlemeta=True ) - raise + # A partir de ArticleSource existente + task_process_article_pipeline.delay( + article_source_id=123, + force_update=True + ) -@celery_app.task(bind=True) -def task_load_articles( - self, - username=None, - user_id=None, - collection_acron_list=None, - journal_acron_list=None, - articlemeta_export_enable=None, - from_pub_year=None, - until_pub_year=None, - from_updated_date=None, - until_updated_date=None, - proc_status_list=None, -): - """ - Task para carregar artigos de uma lista selecionada de periódicos. - Dispara subtasks para cada periódico encontrado. 
+ # Entrada direta via PidProviderXML + task_process_article_pipeline.delay( + pp_xml_id=456, + export_to_articlemeta=True + ) """ try: user = _get_user(self.request, username=username, user_id=user_id) + + if xml_url: + if not collection_acron: + raise ValueError("collection_acron is required when xml_url is provided") + if not pid: + raise ValueError("pid is required when xml_url is provided") + am_article = AMArticle.create_or_update( + pid, Collection.get(collection_acron), None, user + ) + if not am_article: + raise ValueError("Failed to create or update AMArticle with pid: {pid} and collection: {collection_acron}") - proc_status_list = proc_status_list or [ - PPXML_STATUS_TODO, - PPXML_STATUS_INVALID, - ] - # Construir filtros para os periódicos - items = Journal.get_journal_issns(collection_acron_list, journal_acron_list) - if items: - # Iterar pelos periódicos e disparar subtarefas - journals_processed = 0 - for journal_issns in items: - # Filtrar ISSNs válidos - issn_list = [issn for issn in journal_issns if issn] - - if not issn_list: # Só dispara task se houver ISSNs - continue - - task_load_journal_articles.delay( - username=username, - user_id=user_id, - issn_list=issn_list, - articlemeta_export_enable=articlemeta_export_enable, - from_pub_year=from_pub_year, - until_pub_year=until_pub_year, - from_updated_date=from_updated_date, - until_updated_date=until_updated_date, - proc_status_list=proc_status_list, - ) - journals_processed += 1 - - return { - "status": "success", - "journals_processed": journals_processed, - "filters": { - "collections": collection_acron_list, - "journals": journal_acron_list, - "from_pub_year": from_pub_year, - "until_pub_year": until_pub_year, - "from_updated_date": from_updated_date, - "until_updated_date": until_updated_date, - "proc_status_list": proc_status_list, - }, - } - - else: - # Se não há filtros, processa todos os artigos - task_load_journal_articles.delay( - username=username, - user_id=user_id, - issn_list=None, 
- articlemeta_export_enable=articlemeta_export_enable, - from_pub_year=from_pub_year, - until_pub_year=until_pub_year, - from_updated_date=from_updated_date, - until_updated_date=until_updated_date, - proc_status_list=proc_status_list, + article_source = ArticleSource.create_or_update( + user=user, + url=xml_url, + source_date=source_date, + force_update=force_update, + am_article=am_article, + auto_solve_pid_conflict=auto_solve_pid_conflict, ) - return { - "status": "success", - "message": "Processing all articles without journal filters", - "filters": { - "from_pub_year": from_pub_year, - "until_pub_year": until_pub_year, - "from_updated_date": from_updated_date, - "until_updated_date": until_updated_date, - "proc_status_list": proc_status_list, - }, - } + pp_xml_id = article_source.pid_provider_xml.id + + if article_source_id: + article_source = ArticleSource.objects.get(id=article_source_id) + article_source.add_pid_provider( + user=user, + force_update=force_update, + auto_solve_pid_conflict=auto_solve_pid_conflict, + ) + pp_xml_id = article_source.pid_provider_xml.id - except Exception as e: - exc_type, exc_value, exc_traceback = sys.exc_info() - UnexpectedEvent.create( - exception=e, - exc_traceback=exc_traceback, - detail={ - "task": "task_load_articles", - "collection_acron_list": collection_acron_list, - "journal_acron_list": journal_acron_list, - "articlemeta_export_enable": articlemeta_export_enable, - "from_pub_year": from_pub_year, - "until_pub_year": until_pub_year, - "from_updated_date": from_updated_date, - "until_updated_date": until_updated_date, - "proc_status_list": proc_status_list, - }, - ) - raise + if not pp_xml_id: + raise ValueError( + "No valid entry point provided. Please provide either xml_url, " + "article_source_id, pp_xml_id or pid_v3." 
+ ) + pp_xml = PidProviderXML.objects.select_related( + "current_version" + ).get(id=pp_xml_id) -@celery_app.task(bind=True) -def task_load_journal_articles( - self, - username=None, - user_id=None, - issn_list=None, - articlemeta_export_enable=False, - from_pub_year=None, - until_pub_year=None, - from_updated_date=None, - until_updated_date=None, - proc_status_list=None, -): - """ - Task para carregar artigos de um periódico específico. - Dispara subtasks para cada artigo encontrado. - """ - try: - user = _get_user(self.request, username=username, user_id=user_id) + article = load_article(user, pp_xml=pp_xml) + pp_xml.collections.set(article.collections) - proc_status_list = proc_status_list or [ - PPXML_STATUS_TODO, - PPXML_STATUS_INVALID, - ] - # Buscar os XMLs usando os ISSNs do periódico - items = PidProviderXML.get_queryset( - issn_list=issn_list, - from_pub_year=from_pub_year, - until_pub_year=until_pub_year, - from_updated_date=from_updated_date, - until_updated_date=until_updated_date, - proc_status_list=proc_status_list, - ) - if not items.exists(): - return { - "status": "success", - "articles_found": 0, - "message": "No articles found with the specified filters", - "filters": { - "issn_list": issn_list, - "from_pub_year": from_pub_year, - "until_pub_year": until_pub_year, - "from_updated_date": from_updated_date, - "until_updated_date": until_updated_date, - "proc_status_list": proc_status_list, - }, - } - # Contador de artigos processados - articles_processed = 0 - # Iterar sobre os itens e disparar tasks para cada artigo - for item in items.iterator(): - task_load_article_from_pp_xml.delay( - pp_xml_id=item.id, - user_id=user_id or user.id, - username=username or user.username, - articlemeta_export_enable=articlemeta_export_enable, + if export_to_articlemeta: + article.check_availability(user) + task_export_article_to_articlemeta.delay( + pid_v3=article.pid_v3, + collection_acron_list=collection_acron_list, + force_update=force_update, + 
user_id=user.id, + username=user.username, ) - articles_processed += 1 - return { - "status": "success", - "articles_processed": articles_processed, - "operations": { - "articlemeta_export_enable": articlemeta_export_enable, - }, - "filters": { - "issn_list": issn_list, - "from_pub_year": from_pub_year, - "until_pub_year": until_pub_year, - "from_updated_date": from_updated_date, - "until_updated_date": until_updated_date, - "proc_status_list": proc_status_list, - }, - } - except Exception as e: exc_type, exc_value, exc_traceback = sys.exc_info() UnexpectedEvent.create( exception=e, exc_traceback=exc_traceback, detail={ - "task": "task_load_journal_articles", - "issn_list": issn_list, - "articlemeta_export_enable": articlemeta_export_enable, - "from_pub_year": from_pub_year, - "until_pub_year": until_pub_year, - "from_updated_date": from_updated_date, - "until_updated_date": until_updated_date, - "proc_status_list": proc_status_list, - }, - ) - raise - - -@celery_app.task(bind=True) -def task_check_article_availability( - self, - user_id=None, - username=None, - article_id=None, - collection_acron_list=None, - timeout=None, - is_activate=None, - force_update=False, -): - """ - Carrega um artigo específico a partir de um PidProviderXML. - - Processa o XML armazenado no PidProviderXML, cria/atualiza o Article - e opcionalmente exporta para ArticleMeta. 
- - Args: - self: Instância da tarefa Celery - pp_xml_id (int): ID do PidProviderXML a processar (obrigatório) - user_id (int, optional): ID do usuário executando a tarefa - username (str, optional): Nome do usuário executando a tarefa - articlemeta_export_enable (bool, optional): Exporta para ArticleMeta após carregar - - Returns: - None - - Side Effects: - - Cria/atualiza Article no banco - - Atualiza status do PidProviderXML para DONE - - Verifica disponibilidade do artigo - - Exporta para ArticleMeta se solicitado - - Registra UnexpectedEvent em caso de erro - - Notes: - - O XML é lido diretamente do arquivo armazenado no PidProviderXML - - A verificação de disponibilidade valida URLs e assets do artigo - """ - try: - user = _get_user(self.request, username, user_id) - article = Article.objects.get(id=article_id) - article.check_availability(user) - except Exception as exception: - exc_type, exc_value, exc_traceback = sys.exc_info() - UnexpectedEvent.create( - exception=exception, - exc_traceback=exc_traceback, - detail={ - "task": "article.tasks.task_check_article_availability", - "article_id": article_id, - "collection_acron_list": collection_acron_list, - "timeout": timeout, - "is_activate": is_activate, + "task": "article.tasks.task_process_article_pipeline", + "xml_url": xml_url, + "article_source_id": article_source_id, + "pp_xml_id": pp_xml_id, + "pid": pid, + "collection_acron": collection_acron, + "export_to_articlemeta": export_to_articlemeta, "force_update": force_update, }, - ) \ No newline at end of file + ) From 0dfec26dbe0f29255f07b8a6fd3bd8eaac953197 Mon Sep 17 00:00:00 2001 From: Roberta Takenaka Date: Thu, 12 Mar 2026 12:28:06 -0300 Subject: [PATCH 16/27] issue: renomeia task load_issue_from_article_meta para load_issue_from_articlemeta MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ajusta nome da função Celery e as três referências a action= em UnexpectedEvent.create para o novo nome sem underscore 
antes de 'meta'. --- issue/tasks.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/issue/tasks.py b/issue/tasks.py index 0541f32eb..bd1f2391a 100644 --- a/issue/tasks.py +++ b/issue/tasks.py @@ -19,7 +19,7 @@ @celery_app.task(bind=True) -def load_issue_from_article_meta( +def load_issue_from_articlemeta( self, user_id=None, username=None, @@ -71,7 +71,7 @@ def load_issue_from_article_meta( UnexpectedEvent.create( exception=e, exc_traceback=exc_traceback, - action="issue.tasks.load_issue_from_article_meta.schedule_task_load_issue", + action="issue.tasks.load_issue_from_articlemeta.schedule_task_load_issue", detail={ "collection_acron": acron3, "issue_identifier": issue_identifier, @@ -83,7 +83,7 @@ def load_issue_from_article_meta( UnexpectedEvent.create( exception=e, exc_traceback=exc_traceback, - action="issue.tasks.load_issue_from_article_meta.process_collection", + action="issue.tasks.load_issue_from_articlemeta.process_collection", detail={ "collection_acron": acron3, "from_date": from_date, @@ -97,7 +97,7 @@ def load_issue_from_article_meta( UnexpectedEvent.create( exception=e, exc_traceback=exc_traceback, - action="issue.tasks.load_issue_from_article_meta", + action="issue.tasks.load_issue_from_articlemeta", detail={ "collection_acron": collection_acron, "from_date": from_date, From 51890a94e1798f2e0196bbbfa5a268895e7a7ea6 Mon Sep 17 00:00:00 2001 From: Roberta Takenaka Date: Thu, 12 Mar 2026 12:28:06 -0300 Subject: [PATCH 17/27] journal: remove classname='collapsed' dos InlinePanels no admin de Journal e SciELOJournal MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Todos os InlinePanel definidos nos grupos de painéis de Journal (other_titles, mission, history, focus, thematic_area, title_in_database, owner_history, publisher_history, sponsor_history, copyright_holder_history, related_journal_urls, open_science_form_files, open_access_text, open_data, preprint, peer_review, 
open_science_compliance, notes) e de SciELOJournal (journal_history) têm o atributo classname='collapsed' removido, exibindo os painéis expandidos por padrão no Wagtail admin. --- journal/models.py | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/journal/models.py b/journal/models.py index f3c038652..be5849d37 100755 --- a/journal/models.py +++ b/journal/models.py @@ -627,15 +627,15 @@ def autocomplete_custom_queryset_filter(search_term): AutocompletePanel("official"), FieldPanel("title"), FieldPanel("short_title"), - InlinePanel("other_titles", label=_("Other titles"), classname="collapsed"), + InlinePanel("other_titles", label=_("Other titles")), ] panels_scope_and_about = [ - InlinePanel("mission", label=_("Mission"), classname="collapsed"), - InlinePanel("history", label=_("Brief History"), classname="collapsed"), - InlinePanel("focus", label=_("Focus and Scope"), classname="collapsed"), + InlinePanel("mission", label=_("Mission")), + InlinePanel("history", label=_("Brief History")), + InlinePanel("focus", label=_("Focus and Scope")), AutocompletePanel("subject"), - InlinePanel("thematic_area", label=_("Thematic Areas"), classname="collapsed"), + InlinePanel("thematic_area", label=_("Thematic Areas")), AutocompletePanel("subject_descriptor"), AutocompletePanel("wos_area"), AutocompletePanel("wos_db"), @@ -643,14 +643,14 @@ def autocomplete_custom_queryset_filter(search_term): AutocompletePanel("additional_indexed_at"), AutocompletePanel("vocabulary"), InlinePanel( - "title_in_database", label=_("Title in Database"), classname="collapsed" + "title_in_database", label=_("Title in Database") ), ] panels_institutions = [ - InlinePanel("owner_history", label=_("Owner"), classname="collapsed"), - InlinePanel("publisher_history", label=_("Publisher"), classname="collapsed"), - InlinePanel("sponsor_history", label=_("Sponsor"), classname="collapsed"), + InlinePanel("owner_history", label=_("Owner")), + 
InlinePanel("publisher_history", label=_("Publisher")), + InlinePanel("sponsor_history", label=_("Sponsor")), InlinePanel( "copyright_holder_history", label=_("Copyright Holder"), @@ -666,7 +666,7 @@ def autocomplete_custom_queryset_filter(search_term): FieldPanel("logo", heading=_("Logo")), # FieldPanel("journal_url"), InlinePanel( - "related_journal_urls", label=_("Journal Urls"), classname="collapsed" + "related_journal_urls", label=_("Journal Urls") ), FieldPanel("submission_online_url"), FieldPanel("main_collection"), @@ -680,13 +680,13 @@ def autocomplete_custom_queryset_filter(search_term): FieldPanel("open_access"), FieldPanel("url_oa"), InlinePanel( - "open_science_form_files", label=_("Open Science accordance form"), classname="collapsed" + "open_science_form_files", label=_("Open Science accordance form") ), FieldPanel("journal_use_license"), - InlinePanel("open_access_text", label=_("Open Access"), classname="collapsed"), - InlinePanel("open_data", label=_("Open data"), classname="collapsed"), - InlinePanel("preprint", label=_("Preprint"), classname="collapsed"), - InlinePanel("peer_review", label=_("Peer review"), classname="collapsed"), + InlinePanel("open_access_text", label=_("Open Access")), + InlinePanel("open_data", label=_("Open data")), + InlinePanel("preprint", label=_("Preprint")), + InlinePanel("peer_review", label=_("Peer review")), InlinePanel( "open_science_compliance", label=_("Open Science Compliance"), @@ -694,7 +694,7 @@ def autocomplete_custom_queryset_filter(search_term): ), ] - panels_notes = [InlinePanel("notes", label=_("Notes"), classname="collapsed")] + panels_notes = [InlinePanel("notes", label=_("Notes"))] panels_legacy_compatibility_fields = [ FieldPanel("alphabet"), @@ -2192,7 +2192,7 @@ def __str__(self): FieldPanel("status"), AutocompletePanel("collection"), InlinePanel( - "journal_history", label=_("Journal History"), classname="collapsed" + "journal_history", label=_("Journal History") ), ] From 
36baa099b29da406e1029b0d4b3eed223d7f6137 Mon Sep 17 00:00:00 2001 From: Roberta Takenaka Date: Thu, 12 Mar 2026 12:28:06 -0300 Subject: [PATCH 18/27] journal: adiciona campo 'updated' ao list_display de AMJournalAdmin, IndexedAtAdmin, AdditionalIndexedAtAdmin, WebOfKnowledgeAdmin, SubjectAdmin, WosAreaAdmin e StandardAdmin --- journal/wagtail_hooks.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/journal/wagtail_hooks.py b/journal/wagtail_hooks.py index 1badc4c7d..a5632a4ed 100755 --- a/journal/wagtail_hooks.py +++ b/journal/wagtail_hooks.py @@ -398,7 +398,7 @@ class AMJournalAdmin(SnippetViewSet): menu_label = _("AM Journal") menu_icon = "folder" menu_order = get_menu_order("amjournal") - list_display = ("pid", "collection", "processing_date", "status") + list_display = ("pid", "collection", "processing_date", "status", "updated") list_filter = ("collection", "status") search_fields = ("pid",) @@ -556,7 +556,7 @@ class IndexedAtAdmin(SnippetViewSet): menu_icon = "folder" menu_order = 100 add_to_settings_menu = False - list_display = ("name", "acronym", "url", "description", "type") + list_display = ("name", "acronym", "url", "description", "type", "updated") list_filter = ("type",) search_fields = ("name", "acronym") list_export = ("name", "acronym", "url", "description", "type") @@ -569,7 +569,7 @@ class AdditionalIndexedAtAdmin(SnippetViewSet): menu_icon = "folder" menu_order = 110 add_to_settings_menu = False - list_display = ("name",) + list_display = ("name", "updated") search_fields = ("name",) @@ -593,6 +593,7 @@ class WebOfKnowledgeAdmin(SnippetViewSet): list_display = ( "code", "value", + "updated", ) search_fields = ( @@ -609,6 +610,7 @@ class SubjectAdmin(SnippetViewSet): list_display = ( "code", "value", + "updated", ) search_fields = ( @@ -622,7 +624,7 @@ class WosAreaAdmin(SnippetViewSet): menu_icon = "folder" menu_order = 400 add_to_settings_menu = False - list_display = ("value",) + list_display = ("value", 
"updated") search_fields = ("value",) @@ -634,6 +636,7 @@ class StandardAdmin(SnippetViewSet): list_display = ( "code", "value", + "updated", ) search_fields = ( From 59f95f8ba845459fe085f8052d92ceedc01a92ae Mon Sep 17 00:00:00 2001 From: Roberta Takenaka Date: Thu, 12 Mar 2026 12:28:06 -0300 Subject: [PATCH 19/27] =?UTF-8?q?bigbang:=20atualiza=20agendador=20?= =?UTF-8?q?=E2=80=94=20substitui=20tasks=20obsoletas=20por=20task=5Fdispat?= =?UTF-8?q?ch=5Farticles?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit delete_outdated_tasks: adiciona ao inventário de limpeza todas as tasks removidas neste ciclo de refatoração: task_select_articles_to_complete_data, task_select_articles_to_load_from_api, task_select_articles_to_load_from_collection_endpoint, task_select_articles_to_load_from_article_source, task_load_articles, task_load_journal_articles, task_load_article_from_xml_url, task_create_article_source, task_create_pid_provider_xml, task_fix_journal_articles_status, task_select_articles_to_export_to_articlemeta, issue.tasks.load_issue_from_article_meta (legacy). schedule_tasks: chama delete_outdated_tasks no início; substitui schedule_task_select_articles_to_complete_data, schedule_task_select_articles_to_load_from_api, schedule_task_select_articles_to_load_from_article_source e schedule_task_load_articles pela nova schedule_task_dispatch_articles (horário 02:01); remove schedule_bigbang_delete_outdated_tasks (chamada manual no início do agendamento); atualiza referência da task de issue para load_issue_from_articlemeta com parâmetros simplificados. schedule_task_dispatch_articles: agenda task_dispatch_articles com todos os parâmetros disponíveis (collection, journal, datas, proc/data/ article_source status lists, limit, timeout, opac_url, export_to_articlemeta, auto_solve_pid_conflict). 
schedule_task_export_articles_to_articlemeta: atualiza nome da task de task_select_articles_to_export_to_articlemeta para task_export_articles_to_articlemeta. --- bigbang/tasks_scheduler.py | 169 ++++++++++++------------------------- 1 file changed, 56 insertions(+), 113 deletions(-) diff --git a/bigbang/tasks_scheduler.py b/bigbang/tasks_scheduler.py index 4c6ee8e82..d2905f3b5 100644 --- a/bigbang/tasks_scheduler.py +++ b/bigbang/tasks_scheduler.py @@ -31,6 +31,18 @@ def delete_outdated_tasks(task_list=None): "article.tasks.task_convert_xml_to_other_formats_for_articles", "article.tasks.convert_xml_to_other_formats", "article.tasks.task_load_article_from_xml_endpoint", + "article.tasks.task_select_articles_to_complete_data", + "article.tasks.task_select_articles_to_load_from_api", + "article.tasks.task_select_articles_to_load_from_collection_endpoint", + "article.tasks.task_select_articles_to_load_from_article_source", + "article.tasks.task_load_articles", + "article.tasks.task_load_journal_articles", + "article.tasks.task_load_article_from_xml_url", + "article.tasks.task_create_article_source", + "article.tasks.task_create_pid_provider_xml", + "article.tasks.task_fix_journal_articles_status", + "article.tasks.task_select_articles_to_export_to_articlemeta", + "issue.tasks.load_issue_from_article_meta", # Tarefas de Article sem namespace (legacy) "article_complete_data", @@ -47,6 +59,17 @@ def delete_outdated_tasks(task_list=None): "task_load_article_from_article_source", "task_mark_articles_as_deleted_without_pp_xml", "transfer_license_statements_fk_to_article_license", + "task_select_articles_to_complete_data", + "task_select_articles_to_load_from_api", + "task_select_articles_to_load_from_collection_endpoint", + "task_select_articles_to_load_from_article_source", + "task_load_articles", + "task_load_journal_articles", + "task_load_article_from_xml_url", + "task_create_article_source", + "task_create_pid_provider_xml", + "task_fix_journal_articles_status", + 
"task_select_articles_to_export_to_articlemeta", ] delete_tasks(task_list) @@ -61,18 +84,17 @@ def schedule_tasks(username): """ enabled = False + delete_outdated_tasks() + # Tarefas de Article mantidas - schedule_task_select_articles_to_complete_data(username, enabled) + schedule_task_dispatch_articles(username, enabled) schedule_task_export_articles_to_articlemeta(username, enabled) - schedule_task_select_articles_to_load_from_api(username, enabled) - schedule_task_select_articles_to_load_from_article_source(username, enabled) schedule_task_fix_article_status(username, enabled) - schedule_task_load_articles(username, enabled) # Tarefas de issue schedule_export_issue_to_articlemeta(username, enabled) schedule_export_issues_to_articlemeta(username, enabled) - schedule_load_issue_from_article_meta(username, enabled) + schedule_load_issue_from_articlemeta(username, enabled) # Tarefas de journal schedule_export_journal_to_articlemeta(username, enabled) @@ -87,42 +109,46 @@ def schedule_tasks(username): # Tarefas de bigbang schedule_bigbang_start(username, enabled) - schedule_bigbang_delete_outdated_tasks(username, enabled) # ============================================================================== # TAREFAS DE ARTICLE MANTIDAS # ============================================================================== -def schedule_task_select_articles_to_complete_data(username, enabled=False): +def schedule_task_dispatch_articles(username, enabled=False): """ - Agenda a tarefa de completar dados de artigos incompletos + Agenda a tarefa orquestradora de despacho de artigos para o pipeline. + Substitui as antigas tarefas de seleção (complete_data, load_from_api, + load_from_article_source, load_articles). 
""" schedule_task( - task="article.tasks.task_select_articles_to_complete_data", - name="article.tasks.task_select_articles_to_complete_data", + task="article.tasks.task_dispatch_articles", + name="article.tasks.task_dispatch_articles", kwargs=dict( - user_id=None, username=username, - collection_acron_list=[], - journal_acron_list=[], + user_id=None, + collection_acron_list=None, + journal_acron_list=None, from_pub_year=None, until_pub_year=None, + from_date=None, + until_date=None, force_update=False, - from_updated_date=None, - until_updated_date=None, - data_status_list=[], - valid=None, - pp_xml__isnull=True, - sps_pkg_name__isnull=True, - article_license__isnull=True, + export_to_articlemeta=False, + auto_solve_pid_conflict=False, + proc_status_list=None, + data_status_list=None, + limit=None, + timeout=None, + opac_url=None, + article_source_status_list=None, ), - description=_("Complete missing data for articles"), + description=_("Dispatch articles to processing pipeline"), priority=TASK_PRIORITY, enabled=enabled, run_once=False, day_of_week="*", - hour="6", + hour="2", minute="1", ) @@ -132,8 +158,8 @@ def schedule_task_export_articles_to_articlemeta(username, enabled=False): Agenda a tarefa de exportar artigos em lote para ArticleMeta """ schedule_task( - task="article.tasks.task_select_articles_to_export_to_articlemeta", - name="article.tasks.task_select_articles_to_export_to_articlemeta", + task="article.tasks.task_export_articles_to_articlemeta", + name="article.tasks.task_export_articles_to_articlemeta", kwargs=dict( collection_acron_list=[], issn=None, @@ -157,59 +183,8 @@ def schedule_task_export_articles_to_articlemeta(username, enabled=False): ) -def schedule_task_select_articles_to_load_from_api(username, enabled=False): - """ - Agenda a tarefa de carregar artigos de múltiplas coleções via API - """ - schedule_task( - task="article.tasks.task_select_articles_to_load_from_api", - name="article.tasks.task_select_articles_to_load_from_api", - 
kwargs=dict( - username=username, - user_id=None, - collection_acron_list=["scl"], - from_date="2024-01-01", - until_date="2024-02-31", - limit=100, - timeout=10, - force_update=False, - auto_solve_pid_conflict=False, - opac_url=None, - ), - description=_("Load articles from multiple collections via API"), - priority=TASK_PRIORITY, - enabled=enabled, - run_once=False, - day_of_week="*", - hour="3", - minute="1", - ) -def schedule_task_select_articles_to_load_from_article_source(username, enabled=False): - """ - Agenda a tarefa de processar ArticleSources pendentes - """ - schedule_task( - task="article.tasks.task_select_articles_to_load_from_article_source", - name="task_select_articles_to_load_from_article_source", - kwargs=dict( - username=username, - user_id=None, - from_date="2024-01-01", - until_date="2024-12-31", - force_update=False, - auto_solve_pid_conflict=False, - ), - description=_("Process pending ArticleSources"), - priority=TASK_PRIORITY, - enabled=enabled, - run_once=False, - day_of_week="*", - hour="*/6", - minute="30", - ) - def schedule_task_fix_article_status(username, enabled=False): """ @@ -241,35 +216,6 @@ def schedule_task_fix_article_status(username, enabled=False): ) -def schedule_task_load_articles(username, enabled=False): - """ - Agenda a tarefa de carregar artigos do PidProviderXML - """ - schedule_task( - task="article.tasks.task_load_articles", - name="article.tasks.task_load_articles", - kwargs=dict( - username=None, - user_id=None, - collection_acron_list=None, - journal_acron_list=None, - articlemeta_export_enable=None, - from_pub_year=None, - until_pub_year=None, - from_updated_date=None, - until_updated_date=None, - proc_status_list=None, - ), - description=_("Load articles from PidProviderXML"), - priority=TASK_PRIORITY, - enabled=enabled, - run_once=False, - day_of_week="*", - hour="2", - minute="1", - ) - - # ============================================================================== # TAREFAS DE BIGBANG # 
============================================================================== @@ -462,24 +408,21 @@ def schedule_export_journal_to_articlemeta(username, enabled=False): # TAREFAS DE ISSUE # ============================================================================== -def schedule_load_issue_from_article_meta(username, enabled=False): +def schedule_load_issue_from_articlemeta(username, enabled=False): """ Agenda a tarefa de carregar issues do ArticleMeta """ schedule_task( - task="issue.tasks.task_load_issue_from_article_meta", - name="task_load_issue_from_article_meta", + task="issue.tasks.load_issue_from_articlemeta", + name="load_issue_from_articlemeta", kwargs=dict( user_id=None, username=username, - collection="scl", - issn_scielo="0034-8910", + collection_acron=None, from_date=None, until_date=None, - limit=None, - force_update=False, - timeout=None, - reset=None, + force_update=None, + timeout=30, ), description=_("Load issues from ArticleMeta"), priority=TASK_PRIORITY, From 231c72b6f7995e1b349e56e565ccd31b46276b40 Mon Sep 17 00:00:00 2001 From: Roberta Takenaka <505143+robertatakenaka@users.noreply.github.com> Date: Thu, 12 Mar 2026 13:32:51 -0300 Subject: [PATCH 20/27] Apply suggestion from @Copilot Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- article/search_indexes.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/article/search_indexes.py b/article/search_indexes.py index 48507e1d0..c14cb2a8d 100644 --- a/article/search_indexes.py +++ b/article/search_indexes.py @@ -476,7 +476,8 @@ def prepare_dates(self, obj): """This the publication date, that is format by YYYY-MM-DD In the model this field is seperated into pub_date_day, pub_date_month and pub_date_year """ - return obj.pub_date + if obj.pub_date: + return set([obj.pub_date]) def prepare_la(self, obj): """The language of the article.""" From 8bbd91ac911a105d9539ed4bd86ed47c555fc59c Mon Sep 17 00:00:00 2001 From: Roberta Takenaka 
<505143+robertatakenaka@users.noreply.github.com> Date: Thu, 12 Mar 2026 13:36:25 -0300 Subject: [PATCH 21/27] Update article/controller.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- article/controller.py | 1 - 1 file changed, 1 deletion(-) diff --git a/article/controller.py b/article/controller.py index becd033a9..a9464dcc3 100644 --- a/article/controller.py +++ b/article/controller.py @@ -1,5 +1,4 @@ import csv -import itertools import json import logging import sys From acb98b6957b554ad769b8aaea44ebc5bd09aacb8 Mon Sep 17 00:00:00 2001 From: Roberta Takenaka <505143+robertatakenaka@users.noreply.github.com> Date: Thu, 12 Mar 2026 13:38:28 -0300 Subject: [PATCH 22/27] Update article/tasks.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- article/tasks.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/article/tasks.py b/article/tasks.py index 3a7550e70..f7a76bb92 100644 --- a/article/tasks.py +++ b/article/tasks.py @@ -924,7 +924,9 @@ def task_process_article_pipeline( pid, Collection.get(collection_acron), None, user ) if not am_article: - raise ValueError("Failed to create or update AMArticle with pid: {pid} and collection: {collection_acron}") + raise ValueError( + f"Failed to create or update AMArticle with pid: {pid} and collection: {collection_acron}" + ) article_source = ArticleSource.create_or_update( user=user, From 3d7a86248555916d5bdd856c52a8fc1992658588 Mon Sep 17 00:00:00 2001 From: Roberta Takenaka Date: Thu, 12 Mar 2026 18:10:20 -0300 Subject: [PATCH 23/27] solr: adiciona campo indexed_at ao schema MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adiciona campo string multiValuado 'indexed_at' ao schema do Solr para rastrear a data de indexação por documento. 
--- index/7.7.3/core/conf/schema.xml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/index/7.7.3/core/conf/schema.xml b/index/7.7.3/core/conf/schema.xml index ddd7c97aa..b6844b027 100755 --- a/index/7.7.3/core/conf/schema.xml +++ b/index/7.7.3/core/conf/schema.xml @@ -152,12 +152,13 @@ + - +