Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
MJU-POPS
ckanext-archiver
Commits
e1b7b132
Commit
e1b7b132
authored
Jan 27, 2016
by
David Read
Browse files
Merge pull request #19 from datagovuk/master
ETag checking
parents
7fde1b43
ef082bfa
Changes
6
Show whitespace changes
Inline
Side-by-side
README.rst
View file @
e1b7b132
...
...
@@ -132,6 +132,16 @@ NB Previously you needed both ckanext-archiver and ckanext-qa to see the broken
python ckanext/archiver/bin/migrate_task_status.py --write production.ini
Migrations post 2.0
-------------------
Over time the database structure may change. When it does, use the migrate command to update the database schema.
::
paster --plugin=ckanext-archiver archiver migrate -c <path to CKAN ini file>
This is only necessary if you update ckanext-archiver and already have the database tables in place.
Installing a Celery queue backend
---------------------------------
...
...
ckanext/archiver/commands.py
View file @
e1b7b132
...
...
@@ -10,6 +10,7 @@ import ckan.plugins as p
from
pylons
import
config
from
ckan.lib.cli
import
CkanCommand
from
ckan.lib.helpers
import
OrderedDict
REQUESTS_HEADER
=
{
'content-type'
:
'application/json'
}
...
...
@@ -61,6 +62,10 @@ class Archiver(CkanCommand):
{2-chars-of-resource-id}/{resource-id}/filename.csv
Running this moves them to the new locations and updates the
cache_url on each resource to reflect the new location.
paster archiver migrate
- Updates the database schema to include new fields.
'''
# TODO
# paster archiver clean-files
...
...
@@ -120,6 +125,8 @@ class Archiver(CkanCommand):
self
.
log
.
info
(
'Archiver tables are initialized'
)
elif
cmd
==
'migrate-archive-dirs'
:
self
.
migrate_archive_dirs
()
elif
cmd
==
'migrate'
:
self
.
migrate
()
else
:
self
.
log
.
error
(
'Command %s not recognized'
%
(
cmd
,))
...
...
@@ -364,6 +371,46 @@ class Archiver(CkanCommand):
print
" No cache_filepath: {0}"
.
format
(
not_cached_deleted
)
print
" cache_filepath not on disk: {0}"
.
format
(
file_not_found_deleted
)
def migrate(self):
    """Add any missing columns to the ``archival`` database table.

    The existing column names are read from INFORMATION_SCHEMA.COLUMNS and
    compared against the migrations declared below.

    To add a column, append a ``(column name, SQL statement)`` pair to
    MIGRATIONS_ADD; the statement only runs if the column is not already
    present.

    To modify or delete a column, append a ``(column name, SQL statement)``
    pair to MIGRATIONS_MODIFY; the statement only runs if the column does
    exist.
    """
    from ckan import model

    # Built from a sequence of pairs rather than a dict literal: a plain
    # dict is unordered on Python 2, so passing one to OrderedDict would
    # lose the order in which the migrations are declared.
    MIGRATIONS_ADD = OrderedDict([
        ("etag",
         "ALTER TABLE archival ADD COLUMN etag character varying"),
        ("last_modified",
         "ALTER TABLE archival ADD COLUMN last_modified character varying"),
    ])

    MIGRATIONS_MODIFY = OrderedDict([
    ])

    q = "select column_name from INFORMATION_SCHEMA.COLUMNS where table_name = 'archival';"
    current_cols = [row[0] for row in model.Session.execute(q)]

    for column, statement in MIGRATIONS_ADD.items():
        if column not in current_cols:
            self.log.info(u"Adding column '{0}'".format(column))
            self.log.info(u"Executing '{0}'".format(statement))
            model.Session.execute(statement)
            # Commit each statement on its own so earlier migrations are
            # kept even if a later one fails.
            model.Session.commit()

    for column, statement in MIGRATIONS_MODIFY.items():
        if column in current_cols:
            # These statements may alter as well as drop a column, so the
            # message does not claim a removal.
            self.log.info(
                u"Modifying or removing column '{0}'".format(column))
            self.log.info(u"Executing '{0}'".format(statement))
            model.Session.execute(statement)
            model.Session.commit()

    self.log.info("Migrations complete")
def
migrate_archive_dirs
(
self
):
from
ckan
import
model
from
ckan.logic
import
get_action
...
...
ckanext/archiver/model.py
View file @
e1b7b132
...
...
@@ -31,6 +31,7 @@ class Status:
not_broken
=
{
# is_broken = False
0
:
'Archived successfully'
,
1
:
'Content has not changed'
,
}
broken
=
{
# is_broken = True
...
...
@@ -74,7 +75,7 @@ class Status:
@classmethod
def is_ok(cls, status_id):
    """Return True if ``status_id`` is one of the not-broken statuses.

    Both 0 ('Archived successfully') and 1 ('Content has not changed')
    count as OK; any other id does not.
    """
    # A leftover ``return status_id == 0`` used to precede this line and
    # made it unreachable, so status 1 was wrongly reported as not OK.
    return status_id in (0, 1)
broken_enum
=
{
True
:
'Broken'
,
None
:
'Not sure if broken'
,
...
...
@@ -105,6 +106,8 @@ class Archival(Base):
size
=
Column
(
types
.
BigInteger
,
default
=
0
)
mimetype
=
Column
(
types
.
UnicodeText
)
hash
=
Column
(
types
.
UnicodeText
)
etag
=
Column
(
types
.
UnicodeText
)
last_modified
=
Column
(
types
.
UnicodeText
)
# History
first_failure
=
Column
(
types
.
DateTime
)
...
...
ckanext/archiver/plugin.py
View file @
e1b7b132
...
...
@@ -53,16 +53,18 @@ class ArchiverPlugin(p.SingletonPlugin, p.toolkit.DefaultDatasetForm):
# IActions
def get_actions(self):
    """Return the action functions this plugin registers, keyed by name.

    Only the two archiver actions are listed explicitly. A leftover
    ``return dict(... action.__dict__.items() ...)`` used to precede this
    mapping; it made the mapping unreachable and registered every callable
    found in the ``action`` module instead.
    """
    return {
        'archiver_resource_show': action.archiver_resource_show,
        'archiver_dataset_show': action.archiver_dataset_show,
    }
# IAuthFunctions
def get_auth_functions(self):
    """Return the auth functions this plugin registers, keyed by name.

    Only the two archiver auth functions are listed explicitly. A leftover
    ``return dict(... auth.__dict__.items() ...)`` used to precede this
    mapping; it made the mapping unreachable and registered every callable
    found in the ``auth`` module instead.
    """
    return {
        'archiver_resource_show': auth.archiver_resource_show,
        'archiver_dataset_show': auth.archiver_dataset_show,
    }
# ITemplateHelpers
...
...
ckanext/archiver/tasks.py
View file @
e1b7b132
...
...
@@ -67,6 +67,8 @@ class ArchiveError(ArchiverErrorAfterDownloadStarted):
pass
class ChooseNotToDownload(ArchiverErrorAfterDownloadStarted):
    """Archiver error subclass of ArchiverErrorAfterDownloadStarted.

    NOTE(review): presumably raised when the archiver decides not to fetch
    a resource's content; the raise sites are not visible here - confirm.
    """
class NotChanged(ArchiverErrorAfterDownloadStarted):
    """Raised by download() when the response's etag header equals the etag
    stored on the previous archival, i.e. the content has not changed.
    """
class LinkCheckerError(ArchiverError):
    """Base class for link-checking errors such as LinkInvalidError."""
class
LinkInvalidError
(
LinkCheckerError
):
...
...
@@ -119,6 +121,7 @@ def update_package(ckan_ini_filepath, package_id, queue='bulk'):
log
=
update_package
.
get_logger
()
log
.
info
(
'Starting update_package task: package_id=%r queue=%s'
,
package_id
,
queue
)
num_archived
=
0
# Do all work in a sub-routine since it can then be tested without celery.
# Also, wrapping it in try/except makes it easier to monitor ckan's log rather
# than celery's task status.
...
...
@@ -128,7 +131,9 @@ def update_package(ckan_ini_filepath, package_id, queue='bulk'):
for
resource
in
package
[
'resources'
]:
resource_id
=
resource
[
'id'
]
_update_resource
(
ckan_ini_filepath
,
resource_id
,
queue
)
res
=
_update_resource
(
ckan_ini_filepath
,
resource_id
,
queue
)
if
res
:
num_archived
+=
1
except
Exception
,
e
:
if
os
.
environ
.
get
(
'DEBUG'
):
raise
...
...
@@ -137,7 +142,11 @@ def update_package(ckan_ini_filepath, package_id, queue='bulk'):
e
,
package_id
,
package
[
'name'
]
if
'package'
in
dir
()
else
''
)
raise
if
num_archived
>
0
:
log
.
info
(
"Notifying package as %d items were archived"
,
num_archived
)
notify_package
(
package
,
queue
,
ckan_ini_filepath
)
else
:
log
.
info
(
"Not notifying package as 0 items were archived"
)
# Refresh the index for this dataset, so that it contains the latest
# archive info. However skip it if there are downstream plugins that will
...
...
@@ -216,16 +225,24 @@ def _update_resource(ckan_ini_filepath, resource_id, queue):
archive_result
.
get
(
'cache_filename'
)
if
archive_result
else
None
)
# Download
try_as_api
=
False
requires_archive
=
True
log
.
info
(
"Attempting to download resource: %s"
%
resource
[
'url'
])
download_result
=
None
from
ckanext.archiver.model
import
Status
from
ckanext.archiver.model
import
Status
,
Archival
download_status_id
=
Status
.
by_text
(
'Archived successfully'
)
context
=
{
'site_url'
:
config
.
get
(
'ckan.site_url_internally'
)
or
config
[
'ckan.site_url'
],
'cache_url_root'
:
config
.
get
(
'ckanext-archiver.cache_url_root'
),
'previous'
:
Archival
.
get_for_resource
(
resource_id
)
}
try
:
download_result
=
download
(
context
,
resource
)
except
NotChanged
,
e
:
download_status_id
=
Status
.
by_text
(
'Content has not changed'
)
try_as_api
=
False
requires_archive
=
False
except
LinkInvalidError
,
e
:
download_status_id
=
Status
.
by_text
(
'URL invalid'
)
try_as_api
=
False
...
...
@@ -262,6 +279,10 @@ def _update_resource(ckan_ini_filepath, resource_id, queue):
_save
(
download_status_id
,
e
,
resource
,
*
extra_args
)
return
if
not
requires_archive
:
# We don't need to archive if the remote content has not changed
return
None
# Archival
log
.
info
(
'Attempting to archive resource'
)
try
:
...
...
@@ -274,6 +295,7 @@ def _update_resource(ckan_ini_filepath, resource_id, queue):
# Success
_save
(
Status
.
by_text
(
'Archived successfully'
),
''
,
resource
,
download_result
[
'url_redirected_to'
],
download_result
,
archive_result
)
# The return value is only used by tests. Serialized for Celery.
return
json
.
dumps
(
dict
(
download_result
,
**
archive_result
))
...
...
@@ -320,6 +342,12 @@ def download(context, resource, url_timeout=30,
res
=
requests_wrapper
(
log
,
method_func
,
url
,
timeout
=
url_timeout
,
stream
=
True
,
headers
=
headers
)
url_redirected_to
=
res
.
url
if
url
!=
res
.
url
else
None
if
context
.
get
(
'previous'
)
and
(
'etag'
in
res
.
headers
):
if
context
.
get
(
'previous'
).
etag
==
res
.
headers
[
'etag'
]:
log
.
info
(
"ETAG matches, not downloading content"
)
raise
NotChanged
(
"etag suggests content has not changed"
)
if
not
res
.
ok
:
# i.e. 404 or something
raise
DownloadError
(
'Server reported status error: %s %s'
%
(
res
.
status_code
,
res
.
reason
),
...
...
@@ -606,6 +634,8 @@ def save_archival(resource, status_id, reason, url_redirected_to,
archival
.
size
=
download_result
[
'size'
]
archival
.
mimetype
=
download_result
[
'mimetype'
]
archival
.
hash
=
download_result
[
'hash'
]
archival
.
etag
=
download_result
[
'headers'
].
get
(
'etag'
)
archival
.
last_modified
=
download_result
[
'headers'
].
get
(
'last-modified'
)
# History
if
archival
.
is_broken
is
False
:
...
...
ckanext/archiver/templates/report/broken_links.html
View file @
e1b7b132
...
...
@@ -27,9 +27,9 @@
{% endfor %}
</tbody>
</table>
{% endif %}
{% if c.options['organization'] != None %}
{% else %}
<ul>
<li>
Broken datasets: {{ c.data['num_broken_packages'] }} / {{ c.data['num_packages'] }} ({{ c.data.get('broken_package_percent') }}%)
</li>
<li>
Broken links: {{ c.data['num_broken_resources'] }} / {{ c.data['num_resources'] }} ({{ c.data.get('broken_resource_percent') }}%)
</li>
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment