Commit 025e0cd3 authored by David Read
Browse files

Add HTML report

parent 25e7e5ea
......@@ -830,3 +830,74 @@ pdf_datasets_report_info = {
'generate': pdf_datasets_report,
'template': 'report/pdf_datasets_report.html',
# Datasets with HTML link
def html_datasets_report():
    '''
    Returns datasets that only have an HTML link, by organization.

    Unpublished packages are skipped. For every other package the formats of
    its non-documentation resources are collected (lower-cased); a package
    counts as "only HTML" when, after discarding 'asp', '' and None, the only
    remaining format is 'html'.

    Returns a dict with:
      'table' - a list of OrderedDict rows, one per organization that has at
                least one such dataset, ordered by descending dataset count
      'num_datasets_published' - number of packages examined (not unpublished)
      'num_datasets_only_html' - number of packages with only an HTML link
    '''
    # Get packages
    # NOTE(review): the original query expression continued onto a further
    # line (presumably a filter) that is not visible here - restore it if
    # known.
    pkgs = model.Session.query(model.Package)

    # See if HTML
    num_datasets_published = 0
    num_datasets_only_html = 0
    datasets_by_publisher_only_html = collections.defaultdict(list)
    # Formats that do not count as "data" when deciding if HTML is all a
    # dataset offers.
    ignored_formats = set(('asp', '', None))
    # use yield_per, otherwise memory use just goes up til the script is killed
    # by the os.
    for pkg in pkgs.yield_per(100):
        if p.toolkit.asbool(pkg.extras.get('unpublished')):
            # Unpublished datasets are not part of the published total.
            continue
        num_datasets_published += 1

        # A resource may have no format at all; treat that like the empty
        # string rather than failing on None.lower().
        formats = set([(res.format or '').lower() for res in pkg.resources
                       if res.resource_type != 'documentation'])
        if 'html' not in formats:
            continue

        data_formats = formats - ignored_formats
        if data_formats == set(('html',)):
            org = pkg.get_organization().name
            num_datasets_only_html += 1
            # (name, title) pairs - indexed as d[0] / d[1] when building rows
            datasets_by_publisher_only_html[org].append((pkg.name, pkg.title))

    rows = []
    # Organizations with the most HTML-only datasets come first.
    for org_name, datasets_only_html in sorted(
            datasets_by_publisher_only_html.items(),
            key=lambda x: -len(x[1])):
        # NOTE(review): the call that terminates this query is not visible in
        # this view; .first() is assumed.
        org = model.Session.query(model.Group) \
                   .filter_by(name=org_name) \
                   .first()
        # Last element yielded by go_up_tree is used as the top-level
        # organization.
        top_org = list(go_up_tree(org))[-1]

        row = OrderedDict((
            ('organization title', org.title),
            ('organization name', org.name),
            ('top-level organization title', top_org.title),
            ('top-level organization name', top_org.name),
            ('num datasets only html', len(datasets_only_html)),
            # Names are space-separated and titles pipe-separated so that the
            # template can split them back into parallel lists.
            ('name datasets only html',
             ' '.join(d[0] for d in datasets_only_html)),
            ('title datasets only html',
             '|'.join(d[1] for d in datasets_only_html)),
            ))
        rows.append(row)

    return {'table': rows,
            'num_datasets_published': num_datasets_published,
            'num_datasets_only_html': num_datasets_only_html,
            }
# Descriptor for the "HTML Datasets" report: its identifying name, display
# title and description, the function that generates its data and the template
# used to render it. No options are defined for this report.
html_datasets_report_info = {
    'name': 'html_datasets',
    'title': 'HTML Datasets',
    'description': 'Datasets with data only a link to an HTML page.',
    'option_defaults': None,
    'option_combinations': None,
    'generate': html_datasets_report,
    'template': 'report/html_datasets_report.html',
}
......@@ -379,6 +379,7 @@ class PublisherPlugin(p.SingletonPlugin):
{# Report body for "HTML Datasets": summary counts read from `data`, then one
   table row per publisher read from `table`. Each row lists at most
   `truncate` linked dataset titles. #}
<div class="widget-container">
{# The percentage is only shown when there is at least one published dataset, to avoid dividing by zero. #}
<li>Datasets only with an HTML link: {{data['num_datasets_only_html']}} / {{data['num_datasets_published']}}{% if data['num_datasets_published'] %} ({{ '{:.0%}'.format(data['num_datasets_only_html']/data['num_datasets_published']) }}){% endif %}</li>
<li>Publishers with datasets only with an HTML link: {{table|length}}</li>
<p>NB The full lists of datasets are available in the CSV &amp; JSON downloads</p>
<table class="table table-bordered table-condensed tablesorter" id="report-table" style="width: 100%;table-layout:fixed; margin-top: 8px;">
<th style="width: 80px" rowspan="2">Publisher</th>
<th style="width: 80px" rowspan="2">Top-level Publisher</th>
<th style="width: 300px" rowspan="2">Datasets only with an HTML link</th>
{% for row in table %}
<td><a href="/publisher/{{row['organization name']}}">{{row['organization title']}}</a></td>
<td><a href="/publisher/{{row['top-level organization name']}}">{{row['top-level organization title']}}</a></td>
{# Names are space-separated and titles pipe-separated; split them into parallel lists. #}
{% set names = row['name datasets only html'].split() %}
{% set titles = row['title datasets only html'].split('|') %}
{% set truncate = 5 %}
{{row['num datasets only html']}}:
{% for name in names[:truncate] %}
<a href="{{ h.url_for(controller='package', action='read', id=name) }}">{{ titles[loop.index0] }}</a> &nbsp;
{% endfor %}
{% if names|length > truncate %}
{# NOTE(review): the content of this branch (shown when the list was truncated) is not visible in this view. #}
{% endif %}
{% endfor %}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment