Commit 05a8bed6 authored by David Read's avatar David Read
Browse files

Merge branch 'master' of github.com:ckan/ckanext-archiver

parents 84129c2d f7901e68
......@@ -132,6 +132,16 @@ NB Previously you needed both ckanext-archiver and ckanext-qa to see the broken
python ckanext/archiver/bin/migrate_task_status.py --write production.ini
Migrations post 2.0
-------------------
Over time it is possible that the database structure will change. In these cases you can use the migrate command to update the database schema.
::
paster --plugin=ckanext-archiver archiver migrate -c <path to CKAN ini file>
This is only necessary if you update ckanext-archiver and already have the database tables in place.
Installing a Celery queue backend
---------------------------------
......@@ -324,6 +334,20 @@ To run the tests:
(pyenv)~/pyenv/src/ckan$ nosetests --ckan ../ckanext-archiver/tests/ --with-pylons=../ckanext-archiver/test-core.ini
Building Debian package
-----------------------
NB this attempt at creating a Debian package is experimental. Important package dependencies have yet to be specified. The outstanding issue is that some dependencies do not exist as Debian packages (e.g. messytables).
To build the debian package::
cd ckanext-archiver; dpkg-buildpackage -us -uc -i -I -rfakeroot
To list the package contents::
dpkg --contents ../python-ckanext-archiver_0.1-1_all.deb
Questions
---------
......
......@@ -10,6 +10,7 @@ import ckan.plugins as p
from pylons import config
from ckan.lib.cli import CkanCommand
from ckan.lib.helpers import OrderedDict
REQUESTS_HEADER = {'content-type': 'application/json'}
......@@ -61,6 +62,10 @@ class Archiver(CkanCommand):
{2-chars-of-resource-id}/{resource-id}/filename.csv
Running this moves them to the new locations and updates the
cache_url on each resource to reflect the new location.
paster archiver migrate
- Updates the database schema to include new fields.
'''
# TODO
# paster archiver clean-files
......@@ -120,6 +125,8 @@ class Archiver(CkanCommand):
self.log.info('Archiver tables are initialized')
elif cmd == 'migrate-archive-dirs':
self.migrate_archive_dirs()
elif cmd == 'migrate':
self.migrate()
else:
self.log.error('Command %s not recognized' % (cmd,))
......@@ -364,6 +371,46 @@ class Archiver(CkanCommand):
print " No cache_filepath: {0}".format(not_cached_deleted)
print " cache_filepath not on disk: {0}".format(file_not_found_deleted)
def migrate(self):
    """Add any missing columns to the ``archival`` database table.

    The names of the columns currently present are read from
    INFORMATION_SCHEMA and compared against two ordered maps of
    column name -> SQL statement:

    * MIGRATIONS_ADD: to add a column, add its name and SQL statement
      here. The statement only runs if the column is NOT yet present.
    * MIGRATIONS_MODIFY: to modify or delete a column, add its name and
      SQL statement here. The statement only runs if the column IS
      present.

    Every statement that is executed is committed straight away, and the
    statements run in the order in which they are declared.
    """
    from ckan import model

    # Build from a list of pairs rather than a dict literal: a plain dict
    # has no guaranteed order on the Python 2 interpreters this code runs
    # on, so OrderedDict({...}) would record an arbitrary order.
    MIGRATIONS_ADD = OrderedDict([
        ("etag",
         "ALTER TABLE archival ADD COLUMN etag character varying"),
        ("last_modified",
         "ALTER TABLE archival ADD COLUMN last_modified character varying"),
    ])

    MIGRATIONS_MODIFY = OrderedDict([
    ])

    q = "select column_name from INFORMATION_SCHEMA.COLUMNS where table_name = 'archival';"
    # Only used for membership tests, so a set is sufficient.
    current_cols = set(row[0] for row in model.Session.execute(q))

    for column, statement in MIGRATIONS_ADD.items():
        if column not in current_cols:
            self.log.info(u"Adding column '{0}'".format(column))
            self.log.info(u"Executing '{0}'".format(statement))
            model.Session.execute(statement)
            model.Session.commit()

    for column, statement in MIGRATIONS_MODIFY.items():
        if column in current_cols:
            # These statements may alter a column as well as drop it.
            self.log.info(u"Modifying column '{0}'".format(column))
            self.log.info(u"Executing '{0}'".format(statement))
            model.Session.execute(statement)
            model.Session.commit()

    self.log.info("Migrations complete")
def migrate_archive_dirs(self):
from ckan import model
from ckan.logic import get_action
......
......@@ -31,6 +31,7 @@ class Status:
not_broken = {
# is_broken = False
0: 'Archived successfully',
1: 'Content has not changed',
}
broken = {
# is_broken = True
......@@ -74,7 +75,7 @@ class Status:
@classmethod
def is_ok(cls, status_id):
return status_id == 0
return status_id in [0, 1]
broken_enum = {True: 'Broken',
None: 'Not sure if broken',
......@@ -105,6 +106,8 @@ class Archival(Base):
size = Column(types.BigInteger, default=0)
mimetype = Column(types.UnicodeText)
hash = Column(types.UnicodeText)
etag = Column(types.UnicodeText)
last_modified = Column(types.UnicodeText)
# History
first_failure = Column(types.DateTime)
......
......@@ -53,16 +53,18 @@ class ArchiverPlugin(p.SingletonPlugin, p.toolkit.DefaultDatasetForm):
# IActions
def get_actions(self):
return dict((name, function) for name, function
in action.__dict__.items()
if callable(function))
return {
'archiver_resource_show': action.archiver_resource_show,
'archiver_dataset_show': action.archiver_dataset_show,
}
# IAuthFunctions
def get_auth_functions(self):
return dict((name, function) for name, function
in auth.__dict__.items()
if callable(function))
return {
'archiver_resource_show': auth.archiver_resource_show,
'archiver_dataset_show': auth.archiver_dataset_show,
}
# ITemplateHelpers
......
......@@ -67,6 +67,8 @@ class ArchiveError(ArchiverErrorAfterDownloadStarted):
pass
class ChooseNotToDownload(ArchiverErrorAfterDownloadStarted):
pass
# Raised by download() when the etag stored on the previous Archival equals
# the etag header of the new response, i.e. the remote content appears to be
# unchanged. _update_resource() catches it, sets the status to
# 'Content has not changed' and skips the archive step.
class NotChanged(ArchiverErrorAfterDownloadStarted):
pass
class LinkCheckerError(ArchiverError):
pass
class LinkInvalidError(LinkCheckerError):
......@@ -119,6 +121,7 @@ def update_package(ckan_ini_filepath, package_id, queue='bulk'):
log = update_package.get_logger()
log.info('Starting update_package task: package_id=%r queue=%s', package_id, queue)
num_archived = 0
# Do all work in a sub-routine since it can then be tested without celery.
# Also, putting try/except around it makes it easier to monitor ckan's log
# rather than celery's task status.
......@@ -128,7 +131,9 @@ def update_package(ckan_ini_filepath, package_id, queue='bulk'):
for resource in package['resources']:
resource_id = resource['id']
_update_resource(ckan_ini_filepath, resource_id, queue)
res = _update_resource(ckan_ini_filepath, resource_id, queue)
if res:
num_archived += 1
except Exception, e:
if os.environ.get('DEBUG'):
raise
......@@ -137,7 +142,11 @@ def update_package(ckan_ini_filepath, package_id, queue='bulk'):
e, package_id, package['name'] if 'package' in dir() else '')
raise
notify_package(package, queue, ckan_ini_filepath)
if num_archived > 0:
log.info("Notifying package as %d items were archived", num_archived)
notify_package(package, queue, ckan_ini_filepath)
else:
log.info("Not notifying package as 0 items were archived")
# Refresh the index for this dataset, so that it contains the latest
# archive info. However skip it if there are downstream plugins that will
......@@ -216,16 +225,24 @@ def _update_resource(ckan_ini_filepath, resource_id, queue):
archive_result.get('cache_filename') if archive_result else None)
# Download
try_as_api = False
requires_archive = True
log.info("Attempting to download resource: %s" % resource['url'])
download_result = None
from ckanext.archiver.model import Status
from ckanext.archiver.model import Status, Archival
download_status_id = Status.by_text('Archived successfully')
context = {
'site_url': config.get('ckan.site_url_internally') or config['ckan.site_url'],
'cache_url_root': config.get('ckanext-archiver.cache_url_root'),
'previous': Archival.get_for_resource(resource_id)
}
try:
download_result = download(context, resource)
except NotChanged, e:
download_status_id = Status.by_text('Content has not changed')
try_as_api = False
requires_archive = False
except LinkInvalidError, e:
download_status_id = Status.by_text('URL invalid')
try_as_api = False
......@@ -262,6 +279,10 @@ def _update_resource(ckan_ini_filepath, resource_id, queue):
_save(download_status_id, e, resource, *extra_args)
return
if not requires_archive:
# We don't need to archive if the remote content has not changed
return None
# Archival
log.info('Attempting to archive resource')
try:
......@@ -273,7 +294,8 @@ def _update_resource(ckan_ini_filepath, resource_id, queue):
# Success
_save(Status.by_text('Archived successfully'), '', resource,
download_result['url_redirected_to'], download_result, archive_result)
download_result['url_redirected_to'], download_result, archive_result)
# The return value is only used by tests. Serialized for Celery.
return json.dumps(dict(download_result, **archive_result))
......@@ -320,6 +342,12 @@ def download(context, resource, url_timeout=30,
res = requests_wrapper(log, method_func, url, timeout=url_timeout,
stream=True, headers=headers)
url_redirected_to = res.url if url != res.url else None
if context.get('previous') and ('etag' in res.headers):
if context.get('previous').etag == res.headers['etag']:
log.info("ETAG matches, not downloading content")
raise NotChanged("etag suggests content has not changed")
if not res.ok: # i.e. 404 or something
raise DownloadError('Server reported status error: %s %s' %
(res.status_code, res.reason),
......@@ -606,6 +634,8 @@ def save_archival(resource, status_id, reason, url_redirected_to,
archival.size = download_result['size']
archival.mimetype = download_result['mimetype']
archival.hash = download_result['hash']
archival.etag = download_result['headers'].get('etag')
archival.last_modified = download_result['headers'].get('last-modified')
# History
if archival.is_broken is False:
......
......@@ -10,10 +10,10 @@
<col style="width: 10%" />
<col style="width: 10%" />
<tr>
<th>Organization</th>
<th>Datasets with broken links</th>
<th>Broken links</th>
<th>% Broken links</th>
<th class="header">Organization</th>
<th class="header">Datasets with broken links</th>
<th class="header">Broken links</th>
<th class="header">% Broken links</th>
</tr>
</thead>
<tbody>
......@@ -27,9 +27,9 @@
{% endfor %}
</tbody>
</table>
{% endif %}
{% if c.options['organization'] != None %}
{% else %}
<ul>
<li>Broken datasets: {{ c.data['num_broken_packages'] }} / {{ c.data['num_packages'] }} ({{ c.data.get('broken_package_percent') }}%)</li>
<li>Broken links: {{ c.data['num_broken_resources'] }} / {{ c.data['num_resources'] }} ({{ c.data.get('broken_resource_percent') }}%)</li>
......
ckanext-archiver (0.1-1) unstable; urgency=low
* source package automatically created by stdeb 0.5.1
-- Open Knowledge Foundation <info@okfn.org> Wed, 09 May 2012 07:44:10 -0700
Source: ckanext-archiver
Maintainer: Open Knowledge Foundation <info@okfn.org>
Section: python
Priority: optional
Build-Depends: python-setuptools (>= 0.6b3), debhelper (>= 7), python-support (>= 0.8.4)
Standards-Version: 3.7.2
Package: python-ckanext-archiver
Architecture: all
Depends: ${python:Depends}
XB-Python-Version: ${python:Versions}
Provides: ${python:Provides}
Description: Archive ckan resources
Archive ckan resources
python-ckanext-archiver_0.1-1_all.deb python optional
dh_auto_configure
dh_auto_build
dh_auto_test
dh_prep
dh_installdirs
dh_auto_install
dh_install
dh_installdocs
dh_installchangelogs
dh_installexamples
dh_installman
dh_installcatalogs
dh_installcron
dh_installdebconf
dh_installemacsen
dh_installifupdown
dh_installinfo
dh_pysupport
dh_installinit
dh_installmenu
dh_installmime
dh_installmodules
dh_installlogcheck
dh_installlogrotate
dh_installpam
dh_installppp
dh_installudev
dh_installwm
dh_installxfonts
dh_bugfiles
dh_lintian
dh_gconf
dh_icons
dh_perl
dh_usrlocal
dh_link
dh_compress
dh_fixperms
dh_strip
dh_makeshlibs
dh_shlibdeps
dh_installdeb
dh_gencontrol
dh_md5sums
dh_builddeb
# Automatically added by dh_pysupport
if which update-python-modules >/dev/null 2>&1; then
update-python-modules python-ckanext-archiver.public
fi
# End automatically added section
#! /bin/sh
set -e
# This was added by stdeb to workaround Debian #479852. In a nutshell,
# pycentral does not normally remove its symlinks on an
# upgrade. Since we're using python-support, however, those symlinks
# will be broken. This tells python-central to clean up any symlinks.
if [ -e /var/lib/dpkg/info/python-ckanext-archiver.list ] && which pycentral >/dev/null 2>&1
then
pycentral pkgremove python-ckanext-archiver
fi
#DEBHELPER#
# Automatically added by dh_pysupport
if which update-python-modules >/dev/null 2>&1; then
update-python-modules -c python-ckanext-archiver.public
fi
# End automatically added section
python:Versions=2.6
python:Provides=python2.6-ckanext-archiver
python:Depends=python-support (>= 0.90.0)
misc:Depends=
#!/usr/bin/make -f
# This file was automatically generated by stdeb 0.5.1 at
# Wed, 09 May 2012 07:44:10 -0700
# Unset the environment variables set by dpkg-buildpackage. (This is
# necessary because distutils is brittle with compiler/linker flags
# set. Specifically, packages using f2py will break without this.)
unexport CPPFLAGS
unexport CFLAGS
unexport CXXFLAGS
unexport FFLAGS
unexport LDFLAGS
#exports specified using stdeb Setup-Env-Vars:
export DH_OPTIONS=--buildsystem=python_distutils
%:
dh $@
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment