Commit c1e42065 authored by David Read
Browse files

Merge pull request #28 from ckan/only-archive-on-url-change

Only archive if a resource URL changes
parents 5bb00c04 223ed0aa
...@@ -2,7 +2,6 @@ import os ...@@ -2,7 +2,6 @@ import os
import logging import logging
import ckan.plugins as p import ckan.plugins as p
from ckan import model
from ckan.model.types import make_uuid from ckan.model.types import make_uuid
from ckan.lib.celery_app import celery from ckan.lib.celery_app import celery
...@@ -34,3 +33,10 @@ def create_archiver_package_task(package, queue): ...@@ -34,3 +33,10 @@ def create_archiver_package_task(package, queue):
task_id=task_id, queue=queue) task_id=task_id, queue=queue)
log.debug('Archival of package put into celery queue %s: %s', log.debug('Archival of package put into celery queue %s: %s',
queue, queue,
def get_extra_from_pkg_dict(pkg_dict, key, default=None):
    '''Return the value of the extra called `key` in a package dict.

    :param pkg_dict: package dictionary; its 'extras' entry is read as a
        list of dicts that each carry a 'key' and a 'value' entry
    :param key: the key of the extra to look for
    :param default: value returned when no extra has that key
    :returns: the 'value' of the first extra whose 'key' equals `key`,
        otherwise `default`
    '''
    # A dict with no 'extras' entry (or with it set to None) is treated as
    # having no extras, rather than raising KeyError/TypeError.
    for extra in pkg_dict.get('extras') or []:
        if extra['key'] == key:
            return extra['value']
    return default
...@@ -33,10 +33,109 @@ class ArchiverPlugin(p.SingletonPlugin, p.toolkit.DefaultDatasetForm): ...@@ -33,10 +33,109 @@ class ArchiverPlugin(p.SingletonPlugin, p.toolkit.DefaultDatasetForm):
if not isinstance(entity, model.Package): if not isinstance(entity, model.Package):
return return
log.debug('Notified of package event: %s %s',, operation) log.debug('Notified of package event: %s %s',, operation)
run_archiver = \
self._is_it_sufficient_change_to_run_archiver(entity, operation)
if not run_archiver:
log.debug('Creating archiver task: %s',
lib.create_archiver_package_task(entity, 'priority') lib.create_archiver_package_task(entity, 'priority')
def _is_it_sufficient_change_to_run_archiver(self, package, operation):
    '''Decide whether this package event warrants running the archiver.

    Returns True if in this revision any of these happened:
    * it is a new dataset
    * dataset licence changed (affects qa)
    * there are resources that have been added or deleted
    * resources have changed their URL or format (affects qa)

    :param package: the package object the notification is about
    :param operation: 'new', 'deleted', or anything else (treated as
        'changed')
    :returns: True if the package should be archived, otherwise False
    '''
    # NOTE(review): this block arrived with several attribute operands
    # missing. Every expression tagged "restored" below was inferred from
    # the surrounding code and log messages - confirm against the
    # repository before relying on it.
    if operation == 'new':
        log.debug('New package - will archive')
        # even if it has no resources, QA needs to show 0 stars against it
        return True
    elif operation == 'deleted':
        log.debug('Deleted package - won\'t archive')
        return False
    # therefore operation=changed

    # check to see if resources are added, deleted or URL changed

    # look for the latest revision
    rev_list = package.all_related_revisions
    if not rev_list:
        log.debug('No sign of previous revisions - will archive')
        return True
    # I am not confident we can rely on the info about the current
    # revision, because we are still in the 'before_commit' stage. So
    # simply ignore that if it's returned.
    # NOTE(review): right-hand side restored; presumably the id of the
    # revision currently being committed - confirm.
    if rev_list[0][0].id == model.Session.revision.id:
        rev_list = rev_list[1:]
    if not rev_list:
        log.warning('No sign of previous revisions - will archive')
        return True
    previous_revision = rev_list[0][0]
    # NOTE(review): both log arguments restored - confirm.
    log.debug('Comparing with revision: %s %s',
              previous_revision.timestamp, previous_revision.id)

    # get the package as it was at that previous revision
    # NOTE(review): the 'revision_id' entry and the data_dict value are
    # restored - confirm.
    context = {'model': model, 'session': model.Session,
               'ignore_auth': True,
               'revision_id': previous_revision.id}
    data_dict = {'id': package.id}
    try:
        old_pkg_dict = p.toolkit.get_action('package_show')(
            context, data_dict)
    except p.toolkit.NotFound:
        log.warning('No sign of previous package - will archive anyway')
        return True

    # has the licence changed?
    # Both the core licence id and the 'licence' extra are compared; an
    # empty extra is normalised to None on both sides.
    old_licence = (old_pkg_dict['license_id'],
                   lib.get_extra_from_pkg_dict(old_pkg_dict, 'licence')
                   or None)
    new_licence = (package.license_id,
                   package.extras.get('licence') or None)
    if old_licence != new_licence:
        log.debug('Licence has changed - will archive: %r->%r',
                  old_licence, new_licence)
        return True

    # have any resources been added or deleted?
    old_resources = dict((res['id'], res)
                         for res in old_pkg_dict['resources'])
    old_res_ids = set(old_resources.keys())
    # NOTE(review): `res.id` restored - confirm.
    new_res_ids = set((res.id for res in package.resources))
    deleted_res_ids = old_res_ids - new_res_ids
    if deleted_res_ids:
        log.debug('Deleted resources - will archive. res_ids=%r',
                  deleted_res_ids)
        return True
    added_res_ids = new_res_ids - old_res_ids
    if added_res_ids:
        log.debug('Added resources - will archive. res_ids=%r',
                  added_res_ids)
        return True

    # have any resources' url/format changed?
    # No resources were added at this point, so every current resource id
    # is a key of old_resources.
    for res in package.resources:
        for key in ('url', 'format'):
            # NOTE(review): `res.id` restored here and in the log
            # arguments below - confirm.
            old_res_value = old_resources[res.id][key]
            new_res_value = getattr(res, key)
            if old_res_value != new_res_value:
                log.debug('Resource %s changed - will archive. '
                          'id=%s pos=%s url="%s"->"%s"',
                          key, res.id[:4], res.position,
                          old_res_value, new_res_value)
                return True
        log.debug('Resource unchanged. pos=%s id=%s',
                  res.position, res.id[:4])

    log.debug('No new, deleted or changed resources - won\'t archive')
    return False
# IReport # IReport
def register_reports(self): def register_reports(self):
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment