Merged

27 commits
1d26085
Adds migrations for article, core/home, institution, organization,…
robertatakenaka Mar 12, 2026
49013bd
journal: fixes duplication of the UPDATE_POLICY_TYPE constant in choice…
robertatakenaka Mar 12, 2026
69631e4
i18n: updates Spanish translation of 'Publisher' to 'Entidad Edit…
robertatakenaka Mar 12, 2026
ff49ab0
collection: adds cached_property base_url to the Collection model
robertatakenaka Mar 12, 2026
3398922
core: moves CommonControlFieldViewSet from viewsets.py to views.py
robertatakenaka Mar 12, 2026
d556d84
pid_provider: adds XMLURL, refactors deduplication and improves XMLVer…
robertatakenaka Mar 12, 2026
337f979
pid_provider: refactors provide_pid_for_xml_uri with error handling…
robertatakenaka Mar 12, 2026
79332dd
pid_provider: replaces objects.get with get_by_pid_v3 in harvesting.py
robertatakenaka Mar 12, 2026
1242b12
pid_provider: updates CommonControlFieldViewSet import to core.…
robertatakenaka Mar 12, 2026
ccacb0a
pid_provider: adds tests for XMLURL and BasePidProvider.provide_p…
robertatakenaka Mar 12, 2026
d059d82
article: refactors ArticleSource, adds typed exceptions and improves …
robertatakenaka Mar 12, 2026
f6a8147
article: uses get_by_pid_v3 and stores sps_pkg_name in a local variable …
robertatakenaka Mar 12, 2026
fefde02
article: adds fields to the search index and fixes URLs to use …
robertatakenaka Mar 12, 2026
9cfa046
article: adds ArticleIteratorBuilder to the controller for u… selection
robertatakenaka Mar 12, 2026
c1f49f7
article: consolidates tasks into task_dispatch_articles + task_process_art…
robertatakenaka Mar 12, 2026
0dfec26
issue: renames task load_issue_from_article_meta to load_issue_fro…
robertatakenaka Mar 12, 2026
51890a9
journal: removes classname='collapsed' from the InlinePanels in the Jo… admin
robertatakenaka Mar 12, 2026
36baa09
journal: adds 'updated' field to AMJournalAdmin's list_display, …
robertatakenaka Mar 12, 2026
59f95f8
bigbang: updates the scheduler, replacing obsolete tasks with task_disp…
robertatakenaka Mar 12, 2026
231c72b
Apply suggestion from @Copilot
robertatakenaka Mar 12, 2026
8bbd91a
Update article/controller.py
robertatakenaka Mar 12, 2026
acb98b6
Update article/tasks.py
robertatakenaka Mar 12, 2026
3d7a862
solr: adds indexed_at field to the schema
robertatakenaka Mar 12, 2026
21916e5
harvesters: fixes URL construction and normalizes default values
robertatakenaka Mar 12, 2026
f64ad0e
article: replaces collection.domain with collection.base_url in the const…
robertatakenaka Mar 12, 2026
662de15
article: adds yield counters and summary log to ArticleItera…
robertatakenaka Mar 12, 2026
17cc3ab
article: refactors check_availability call in task_process_artic…
robertatakenaka Mar 12, 2026
224 changes: 210 additions & 14 deletions article/controller.py
@@ -3,29 +3,21 @@
import logging
import sys
import traceback
from datetime import datetime

from django.db.models import Q
from packtools.sps.formats.am import am

from article.sources.xmlsps import load_article
from article.models import Article, ArticleExporter, ArticleFunding
from article.choices import (
DATA_STATUS_DUPLICATED,
DATA_STATUS_DEDUPLICATED,
DATA_STATUS_PUBLIC,
)
from article.models import Article, ArticleExporter, ArticleFunding, ArticleSource
from article import choices
from collection.models import Collection
from core.mongodb import write_item
from core.utils import date_utils
from core.utils.harvesters import AMHarvester, OPACHarvester
from institution.models import Sponsor
from journal.models import Journal, SciELOJournal
from journal.models import Journal
from pid_provider.choices import (
PPXML_STATUS_TODO,
PPXML_STATUS_DUPLICATED,
PPXML_STATUS_DEDUPLICATED,
PPXML_STATUS_INVALID,
)
from pid_provider.models import PidProviderXML, XMLVersionXmlWithPreError
from pid_provider.models import PidProviderXML
from tracker.models import UnexpectedEvent


@@ -403,3 +395,207 @@ def bulk_export_articles_to_articlemeta(
},
)
raise


class ArticleIteratorBuilder:
"""
Builds and chains article-selection iterators for dispatch to the pipeline.

Each ``_iter_from_*`` method is a generator that yields kwargs ready for
``task_process_article_pipeline``. The active iterators are determined by
the exclusive arguments present on the instance; more than one may be
active at the same time.

Exclusive arguments and their iterators:

========================= ================================================
Exclusive argument         Activated iterator
========================= ================================================
proc_status_list           _iter_from_pid_provider
data_status_list           _iter_from_article
limit / timeout / opac_url _iter_from_harvest
article_source_status_list _iter_from_article_source
(none)                     _iter_from_pid_provider (default)
========================= ================================================

Usage::

it = ArticleIteratorBuilder(
user=user,
collection_acron_list=["scl"],
proc_status_list=["todo"],
data_status_list=["invalid"],
)
for kwargs in it:
task_process_article_pipeline.delay(**kwargs)
"""

def __init__(
self,
user,
collection_acron_list=None,
journal_acron_list=None,
from_pub_year=None,
until_pub_year=None,
from_date=None,
until_date=None,
proc_status_list=None,
data_status_list=None,
article_source_status_list=None,
limit=None,
timeout=None,
opac_url=None,
force_update=None,
):
self.user = user
self.collection_acron_list = collection_acron_list
self.journal_acron_list = journal_acron_list
self.from_pub_year = from_pub_year
self.until_pub_year = until_pub_year
self.from_date = from_date
self.until_date = until_date
self.proc_status_list = proc_status_list
self.data_status_list = data_status_list
self.article_source_status_list = article_source_status_list
self.limit = limit
self.timeout = timeout
self.opac_url = opac_url
self.force_update = force_update

self._iter_from_harvest_count = 0
self._iter_from_article_source_count = 0
self._iter_from_pid_provider_count = 0
self._iter_from_article_count = 0

def __iter__(self):
yield from self._iter_from_harvest()
yield from self._iter_from_article_source()
yield from self._iter_from_pid_provider()
yield from self._iter_from_article()

Comment on lines +471 to +475

Copilot AI Mar 12, 2026

ArticleIteratorBuilder.__iter__ unconditionally yields from all four iterators. This contradicts the class docstring (which says iterators are activated by “exclusive” args) and means every dispatch will also harvest all collections (and also iterate PidProviderXML + Article + ArticleSource), potentially duplicating work and massively increasing load. Make iterator selection conditional so only the requested sources run (and default to only _iter_from_pid_provider when no exclusive args are provided).

Suggested change
yield from self._iter_from_harvest()
yield from self._iter_from_article_source()
yield from self._iter_from_pid_provider()
yield from self._iter_from_article()
"""
Iterates over articles according to the "exclusive" arguments.
Iterator selection follows the class documentation:
- If `opac_url` was provided, iterate from the harvest
(`_iter_from_harvest`).
- If `article_source_status_list` was provided, iterate from
`ArticleSource` (`_iter_from_article_source`).
- If `data_status_list` was provided, iterate from `Article`
(`_iter_from_article`).
- If `proc_status_list` was provided, iterate from
`PidProviderXML` (`_iter_from_pid_provider`).
If none of these "exclusive" arguments is provided, the default
behavior is to iterate only from `PidProviderXML`.
"""
any_selected = False
# Harvest only when explicitly requested (for example, via opac_url)
if self.opac_url is not None:
any_selected = True
yield from self._iter_from_harvest()
# ArticleSource controlled by article_source_status_list
if self.article_source_status_list is not None:
any_selected = True
yield from self._iter_from_article_source()
# Article controlled by data_status_list
if self.data_status_list is not None:
any_selected = True
yield from self._iter_from_article()
# PidProviderXML controlled by proc_status_list
if self.proc_status_list is not None:
any_selected = True
yield from self._iter_from_pid_provider()
# Default: PidProviderXML only when no exclusive argument was provided
if not any_selected:
yield from self._iter_from_pid_provider()

logging.info(f"Iterators summary: harvest={self._iter_from_harvest_count}, "
f"article_source={self._iter_from_article_source_count}, "
f"pid_provider={self._iter_from_pid_provider_count}, "
f"article={self._iter_from_article_count}")

# ------------------------------------------------------------------
# Selection iterators
# ------------------------------------------------------------------

def _iter_from_pid_provider(self):
"""Itera PidProviderXML filtrados por periódico, data e status."""
journal_issn_groups = (
Journal.get_journal_issns(self.collection_acron_list, self.journal_acron_list)
or [None]
)
for journal_issns in journal_issn_groups:
issn_list = [i for i in journal_issns if i] if journal_issns else None
if journal_issns and not issn_list:
continue
qs = PidProviderXML.get_queryset(
issn_list=issn_list,
from_pub_year=self.from_pub_year,
until_pub_year=self.until_pub_year,
from_updated_date=self.from_date,
until_updated_date=self.until_date,
proc_status_list=self.proc_status_list or [PPXML_STATUS_TODO, PPXML_STATUS_INVALID],
)
self._iter_from_pid_provider_count += qs.count()
for item in qs.iterator():
yield {"pp_xml_id": item.id}
logging.info(f"_iter_from_pid_provider: yielded {self._iter_from_pid_provider_count} items")

def _iter_from_article(self):
"""
Iterates Articles filtered by data_status.
Yields None for articles without a recoverable pp_xml (signals a skip).
"""
filters = {
"data_status__in": self.data_status_list or [
choices.DATA_STATUS_PENDING,
choices.DATA_STATUS_UNDEF,
choices.DATA_STATUS_INVALID,
]
}
journal_id_list = Journal.get_ids(
collection_acron_list=self.collection_acron_list,
journal_acron_list=self.journal_acron_list,
)
if journal_id_list:
filters["journal__in"] = journal_id_list
if self.from_pub_year:
filters["pub_year__gte"] = self.from_pub_year
if self.until_pub_year:
filters["pub_year__lte"] = self.until_pub_year
if self.from_date:
filters["updated__gte"] = self.from_date
if self.until_date:
filters["updated__lte"] = self.until_date

articles = Article.objects.filter(**filters)
self._iter_from_article_count += articles.count()
for article in articles.iterator():
Comment on lines +501 to +537

Copilot AI Mar 12, 2026

Both _iter_from_pid_provider() and _iter_from_article() call .count() on potentially large querysets (and then iterate them), causing extra full COUNT queries that can be expensive in production. Consider removing these .count() calls (or using a cheap counter while iterating) and logging only the iterated count.
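
A minimal sketch of the cheap-counter approach suggested here, with qs standing in for the queryset already built inside _iter_from_pid_provider (an illustration under that assumption, not the merged code):

    count = 0
    for item in qs.iterator():
        count += 1  # count while iterating; avoids a separate COUNT(*) query
        yield {"pp_xml_id": item.id}
    self._iter_from_pid_provider_count = count
    logging.info(f"_iter_from_pid_provider: yielded {count} items")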

if not article.pp_xml:
try:
article.pp_xml = PidProviderXML.get_by_pid_v3(pid_v3=article.pid_v3)
article.save(update_fields=["pp_xml"])
except Exception as e:
logging.error(f"pp_xml not found for article {article.id}: {e}")
yield None
continue
yield {"pp_xml_id": article.pp_xml.id}
logging.info(f"_iter_from_article: yielded {self._iter_from_article_count} articles")

def _iter_from_harvest(self):
"""Itera documentos coletados via OPAC ou ArticleMeta."""

if Collection.objects.count() == 0:
Collection.load(self.user)

count = 0
for collection_acron in self.collection_acron_list or list(Collection.get_acronyms()):
logging.info(collection_acron)
harvester = self._build_harvester(collection_acron)
logging.info(harvester)
for document in harvester.harvest_documents():
count += 1
yield {
"xml_url": document["url"],
"collection_acron": collection_acron,
"pid": document["pid_v2"],
"source_date": document.get("processing_date") or document.get("origin_date"),
}

self._iter_from_harvest_count = count
logging.info(f"Harvest iterator yielded {count} documents")

def _iter_from_article_source(self):
"""Itera ArticleSources pendentes ou com erro."""
count = 0
for article_source in ArticleSource.get_queryset_to_complete_data(
self.from_date,
self.until_date,
self.force_update,
self.article_source_status_list,
):
count += 1
yield {"article_source_id": article_source.id}
self._iter_from_article_source_count += count
logging.info(f"ArticleSource iterator yielded {count} items")

# ------------------------------------------------------------------
# Private helpers
# ------------------------------------------------------------------

def _build_harvester(self, collection_acron):
"""Instancia o harvester adequado para a coleção."""
kwargs = dict(
from_date=self.from_date,
until_date=self.until_date,
limit=self.limit,
timeout=self.timeout,
)
if collection_acron == "scl":
return OPACHarvester(self.opac_url or "www.scielo.br", collection_acron, **kwargs)

Copilot AI Mar 12, 2026

ArticleIteratorBuilder._build_harvester() passes "www.scielo.br" to OPACHarvester, but OPACHarvester now expects domain to already include the scheme (it concatenates with f"{self.domain}/..."). This will generate invalid URLs like www.scielo.br/api/.... Pass a fully qualified base URL (e.g., https://www.scielo.br) or normalize inside OPACHarvester.

Suggested change
return OPACHarvester(self.opac_url or "www.scielo.br", collection_acron, **kwargs)
domain = self.opac_url or "https://www.scielo.br"
if not domain.startswith(("http://", "https://")):
domain = f"https://{domain}"
return OPACHarvester(domain, collection_acron, **kwargs)

return AMHarvester("article", collection_acron, **kwargs)

32 changes: 32 additions & 0 deletions article/migrations/0048_alter_articlesource_status.py
@@ -0,0 +1,32 @@
# Generated by Django 5.2.7 on 2026-03-01 15:15

from django.db import migrations, models


class Migration(migrations.Migration):

dependencies = [
("article", "0047_articleaffiliation_normalized_and_more"),
]

operations = [
migrations.AlterField(
model_name="articlesource",
name="status",
field=models.CharField(
choices=[
("pending", "Pending"),
("processing", "Processing"),
("completed", "Completed"),
("error", "Error"),
("reprocess", "Reprocess"),
("url_error", "URL Error"),
("xml_error", "XML Error"),
],
default="pending",
help_text="Processing status of the article source",
max_length=20,
verbose_name="Status",
),
),
]