Commit 25e7e5ea authored by David Read
Browse files

PDF datasets

parent 3d32ec4f
......@@ -752,10 +752,81 @@ def licence_combinations():
licence_report_info = {
'name': 'licence',
'title': 'Licences',
'description': 'Licenses for datasets, reported by publisher.',
'description': 'Licenses for datasets.',
'option_defaults': OrderedDict((('organization', None),
('include_sub_organizations', False))),
'option_combinations': licence_combinations,
'generate': licence_report,
'template': 'report/licence_report.html',
# Datasets only in PDF
def pdf_datasets_report():
Returns datasets that have data in PDF format, by organization.
# Get packages
pkgs = model.Session.query(model.Package)\
# See if PDF
num_datasets_published = 0
num_datasets_only_pdf = 0
datasets_by_publisher_only_pdf = collections.defaultdict(list)
# Use yield_per, otherwise memory use just keeps growing until the script is
# killed by the OS.
for pkg in pkgs.yield_per(100):
if p.toolkit.asbool(pkg.extras.get('unpublished')):
num_datasets_published += 1
formats = set([res.format.lower() for res in pkg.resources
if res.resource_type != 'documentation'])
if 'pdf' not in formats:
org = pkg.get_organization().name
data_formats = formats - set(('html', '', None))
if data_formats == set(('pdf',)):
num_datasets_only_pdf += 1
datasets_by_publisher_only_pdf[org].append((, pkg.title))
rows = []
for org_name, datasets_only_pdf in sorted(
key=lambda x: -len(x[1])):
org = model.Session.query(model.Group) \
.filter_by(name=org_name) \
top_org = list(go_up_tree(org))[-1]
row = OrderedDict((
('organization title', org.title),
('organization name',,
('top-level organization title', top_org.title),
('top-level organization name',,
('num datasets only pdf', len(datasets_only_pdf)),
('name datasets only pdf',
' '.join(d[0] for d in datasets_only_pdf)),
('title datasets only pdf',
'|'.join(d[1] for d in datasets_only_pdf)),
return {'table': rows,
'num_datasets_published': num_datasets_published,
'num_datasets_only_pdf': num_datasets_only_pdf,
pdf_datasets_report_info = {
'name': 'pdf_datasets',
'title': 'PDF Datasets',
'description': 'Datasets with data only in PDF format.',
'option_defaults': None,
'option_combinations': None,
'generate': pdf_datasets_report,
'template': 'report/pdf_datasets_report.html',
......@@ -378,6 +378,7 @@ class PublisherPlugin(p.SingletonPlugin):
<div class="widget-container">
<li>Datasets only in PDF: {{data['num_datasets_only_pdf']}} / {{data['num_datasets_published']}} ({{ '{:.0%}'.format(data['num_datasets_only_pdf']/data['num_datasets_published']) }})</li>
<li>Publishers with datasets only in PDF: {{table|length}}</li>
<p>NB The full lists of datasets are available in the CSV &amp; JSON downloads</p>
<table class="table table-bordered table-condensed tablesorter" id="report-table" style="width: 100%;table-layout:fixed; margin-top: 8px;">
<th style="width: 80px" rowspan="2">Publisher</th>
<th style="width: 80px" rowspan="2">Top-level Publisher</th>
<th style="width: 300px" rowspan="2">Datasets only in PDF</th>
{% for row in table %}
<td><a href="/publisher/{{row['organization name']}}">{{row['organization title']}}</a></td>
<td><a href="/publisher/{{row['top-level organization name']}}">{{row['top-level organization title']}}</a></td>
{% set names = row['name datasets only pdf'].split() %}
{% set titles = row['title datasets only pdf'].split('|') %}
{% set truncate = 5 %}
{{row['num datasets only pdf']}}:
{% for name in names[:truncate] %}
<a href="{{ h.url_for(controller='package', action='read', id=name) }}">{{ titles[loop.index0] }}</a> &nbsp;
{% endfor %}
{% if names|length > truncate %}
{% endif %}
{% endfor %}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment