Source code for rspub.core.exe_changelist

#! /usr/bin/env python3
# -*- coding: utf-8 -*-
"""
:samp:`Executors creating changelists`

Concrete classes:
    - :class:`NewChangeListExecutor`
    - :class:`IncrementalChangeListExecutor`

"""
import os
from abc import ABCMeta
from glob import glob
from resync import ChangeList
from resync import Resource
from resync import ResourceList
from resync.sitemap import Sitemap
from rspub.core.executors import Executor, SitemapData, ExecutorEvent
from rspub.core.rs_enum import Capability
from rspub.core.rs_paras import RsParameters


class ChangeListExecutor(Executor, metaclass=ABCMeta):
    """
    :samp:`Abstract class for creating changelists`
    """

    def generate_rs_documents(self, filenames: iter) -> [SitemapData]:
        pass

    def __init__(self, rs_parameters: RsParameters=None):
        Executor.__init__(self, rs_parameters)
        # the following attributes will all be set in the method update_previous_state
        self.previous_resources = None
        self.date_resourcelist_completed = None
        self.date_changelist_from = None
        self.resourcelist_files = []
        self.changelist_files = []
##
    # Build (or rebuild) changelist-index.xml when there is more than one changelist.
    def create_index(self, sitemap_data_iter: iter) -> SitemapData:
        changelist_index_path = self.para.abs_metadata_path("changelist-index.xml")
        changelist_index_uri = self.para.uri_from_path(changelist_index_path)
        if os.path.exists(changelist_index_path):
            os.remove(changelist_index_path)

        changelist_files = sorted(glob(self.para.abs_metadata_path("changelist_*.xml")))
        if len(changelist_files) > 1:
            changelist_index = ChangeList()
            changelist_index.sitemapindex = True
            changelist_index.md_from = self.date_resourcelist_completed
            for cl_file in changelist_files:
                changelist = self.read_sitemap(cl_file, ChangeList())
                uri = self.para.uri_from_path(cl_file)
                changelist_index.resources.append(Resource(uri=uri, md_from=changelist.md_from,
                                                           md_until=changelist.md_until))
                if self.para.is_saving_sitemaps:
                    index_link = changelist.link("index")
                    if index_link is None:
                        changelist.link_set(rel="index", href=changelist_index_uri)
                        self.save_sitemap(changelist, cl_file)

            self.finish_sitemap(-1, changelist_index)

    # Reconstruct the previous state of resources from existing resourcelists and changelists.
    def update_previous_state(self):
        if self.previous_resources is None:
            self.previous_resources = {}

            # search for resourcelists
            self.resourcelist_files = sorted(glob(self.para.abs_metadata_path("resourcelist_*.xml")))
            for rl_file_name in self.resourcelist_files:
                resourcelist = ResourceList()
                with open(rl_file_name, "r", encoding="utf-8") as rl_file:
                    sm = Sitemap()
                    sm.parse_xml(rl_file, resources=resourcelist)

                self.date_resourcelist_completed = resourcelist.md_completed
                if self.date_resourcelist_completed is None:
                    self.date_resourcelist_completed = resourcelist.md_at

                self.previous_resources.update({resource.uri: resource for resource in resourcelist.resources})

            # search for changelists
            self.changelist_files = sorted(glob(self.para.abs_metadata_path("changelist_*.xml")))
            for cl_file_name in self.changelist_files:
                changelist = ChangeList()
                with open(cl_file_name, "r", encoding="utf-8") as cl_file:
                    sm = Sitemap()
                    sm.parse_xml(cl_file, resources=changelist)

                for resource in changelist.resources:
                    if resource.change == "created" or resource.change == "updated":
                        self.previous_resources.update({resource.uri: resource})
                    elif resource.change == "deleted" and resource.uri in self.previous_resources:
                        del self.previous_resources[resource.uri]

    # Return a generator that compares previous and current state and yields
    # (SitemapData, ChangeList) tuples.
    def changelist_generator(self, filenames: iter) -> iter:

        def generator(changelist=None) -> [SitemapData, ChangeList]:
            resource_generator = self.resource_generator()
            self.update_previous_state()
            prev_r = self.previous_resources
            curr_r = {resource.uri: resource for count, resource in resource_generator(filenames)}
            created = [r for r in curr_r.values() if r.uri not in prev_r]
            updated = [r for r in curr_r.values() if r.uri in prev_r and r.md5 != prev_r[r.uri].md5]
            deleted = [r for r in prev_r.values() if r.uri not in curr_r]
            unchang = [r for r in curr_r.values() if r.uri in prev_r and r.md5 == prev_r[r.uri].md5]

            # remove lastmod from deleted resource metadata
            for resource in deleted:
                resource.lastmod = None

            num_created = len(created)
            num_updated = len(updated)
            num_deleted = len(deleted)
            tot_changes = num_created + num_updated + num_deleted
            self.observers_inform(self, ExecutorEvent.found_changes, created=num_created,
                                  updated=num_updated, deleted=num_deleted, unchanged=len(unchang))

            all_changes = {"created": created, "updated": updated, "deleted": deleted}

            ordinal = self.find_ordinal(Capability.changelist.name)

            resource_count = 0
            if changelist:
                ordinal -= 1
                resource_count = len(changelist)
                if resource_count >= self.para.max_items_in_list:
                    changelist = None
                    ordinal += 1
                    resource_count = 0

            for kv in all_changes.items():
                for resource in kv[1]:
                    if changelist is None:
                        changelist = ChangeList()
                        changelist.md_from = self.date_changelist_from
                    resource.change = kv[0]  # type of change: created, updated or deleted
                    resource.md_datetime = self.date_start_processing
                    changelist.add(resource)
                    resource_count += 1

                    # under conditions: yield the current changelist
                    if resource_count % self.para.max_items_in_list == 0:
                        ordinal += 1
                        sitemap_data = self.finish_sitemap(ordinal, changelist)
                        yield sitemap_data, changelist
                        changelist = None

            # under conditions: yield the current and last changelist
            if changelist and tot_changes > 0:
                ordinal += 1
                sitemap_data = self.finish_sitemap(ordinal, changelist)
                yield sitemap_data, changelist

        return generator
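
# The change detection in changelist_generator compares the previous state (prev_r) with the
# current state (curr_r), both keyed by resource URI, and partitions resources by URI membership
# and md5 comparison. A minimal, self-contained sketch of that partitioning on hypothetical toy
# data (plain dicts of uri -> md5, not part of the rspub API):
#
#     prev = {"a.txt": "md5-1", "b.txt": "md5-2", "x.txt": "md5-7"}
#     curr = {"a.txt": "md5-1", "b.txt": "md5-9", "c.txt": "md5-3"}
#
#     created   = [u for u in curr if u not in prev]                        # ["c.txt"]
#     updated   = [u for u in curr if u in prev and curr[u] != prev[u]]     # ["b.txt"]
#     deleted   = [u for u in prev if u not in curr]                        # ["x.txt"]
#     unchanged = [u for u in curr if u in prev and curr[u] == prev[u]]     # ["a.txt"]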

class NewChangeListExecutor(ChangeListExecutor):
    """
    :samp:`Implements the new changelist strategy`

    A :class:`NewChangeListExecutor` creates new changelists every time the executor runs
    (and is_saving_sitemaps).

    If there are previous changelists that are not closed (md:until is not set), this executor
    will close those previous changelists by setting their md:until value to now
    (start_of_processing).
    """

    def generate_rs_documents(self, filenames: iter):
        self.update_previous_state()
        if len(self.changelist_files) == 0:
            self.date_changelist_from = self.date_resourcelist_completed
        else:
            self.date_changelist_from = self.date_start_processing

        sitemap_data_iter = []
        generator = self.changelist_generator(filenames)
        for sitemap_data, changelist in generator():
            sitemap_data_iter.append(sitemap_data)

        return sitemap_data_iter

    def post_process_documents(self, sitemap_data_iter: iter):
        # Change the md:until value of older changelists - but only if we created new changelists.
        # self.changelist_files was globbed before new documents were generated (in self.update_previous_state).
        if len(sitemap_data_iter) > 0 and self.para.is_saving_sitemaps:
            for filename in self.changelist_files:
                changelist = self.read_sitemap(filename, ChangeList())
                if changelist.md_until is None:
                    changelist.md_until = self.date_start_processing
                    self.save_sitemap(changelist, filename)
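
# A hedged sketch of the "new changelist" workflow implemented above, using only methods defined
# in this module; the executor arguments (rs_parameters, filenames) are placeholders:
#
#     executor = NewChangeListExecutor(rs_parameters)
#     sitemap_data_iter = executor.generate_rs_documents(filenames)  # write new changelist(s)
#     executor.post_process_documents(sitemap_data_iter)             # close older changelists (md:until)
#     executor.create_index(sitemap_data_iter)                       # (re)write changelist-index.xml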

class IncrementalChangeListExecutor(ChangeListExecutor):
    """
    :samp:`Implements the incremental changelist strategy`

    An :class:`IncrementalChangeListExecutor` adds changes to an already existing changelist
    every time the executor runs (and is_saving_sitemaps).
    """

    def generate_rs_documents(self, filenames: iter):
        self.update_previous_state()
        self.date_changelist_from = self.date_resourcelist_completed
        changelist = None
        if len(self.changelist_files) > 0:
            changelist = self.read_sitemap(self.changelist_files[-1], ChangeList())

        sitemap_data_iter = []
        generator = self.changelist_generator(filenames)
        for sitemap_data, changelist in generator(changelist=changelist):
            sitemap_data_iter.append(sitemap_data)

        return sitemap_data_iter
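
# A minimal usage sketch for the incremental strategy. Hedged: the RsParameters keyword
# arguments shown (resource_dir, metadata_dir, url_prefix) are assumed configuration names
# and the paths are placeholders; in practice the base Executor's own orchestration may be
# the preferred entry point rather than calling generate_rs_documents directly.
#
#     paras = RsParameters(resource_dir="/path/to/resources",
#                          metadata_dir="metadata",
#                          url_prefix="http://example.com/")
#     executor = IncrementalChangeListExecutor(paras)
#     filenames = ["/path/to/resources/collection1"]
#     sitemap_data_iter = executor.generate_rs_documents(filenames)  # extend the newest changelist
#     executor.create_index(sitemap_data_iter)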