# reports.py
#
# Report definitions for broken dataset resource links.
import copy

import ckan.model as model
4
import ckan.plugins as p
5
6

from ckan.lib.helpers import OrderedDict
7
from ckanext.report import lib
8

9
def broken_links(organization, include_sub_organizations=False):
    '''Generate the broken-links report.

    Dispatches to the index report (counts for every organization) when no
    organization is given, otherwise to the detailed report for the given
    organization.

    params:
      organization - name of an organization, or None for the index
      include_sub_organizations - passed through unchanged to the chosen
          report function
    '''
    if organization is None:
        return broken_links_index(
            include_sub_organizations=include_sub_organizations)
    return broken_links_for_organization(
        organization=organization,
        include_sub_organizations=include_sub_organizations)

15

16
def broken_links_index(include_sub_organizations=False):
    '''Returns the count of broken links for all organizations.

    params:
      include_sub_organizations - if True, each organization's row also
          includes the counts of the organizations returned by its
          get_children_group_hierarchy(). The overall totals are not
          affected by this option.

    Returns:
    {'table': [
       {'organization_title', 'organization_name', 'package_count',
        'resource_count', 'broken_package_count',
        'broken_package_percent', 'broken_resource_count',
        'broken_resource_percent'}
      ...],
     'num_broken_packages', 'num_broken_resources', 'num_packages',
     'num_resources', 'broken_package_percent',
     'broken_resource_percent'}
    '''
    from ckanext.archiver.model import Archival

    count_keys = ('broken_packages', 'broken_resources',
                  'packages', 'resources')

    counts = {}
    # Get all the broken datasets and build up the results by org
    for org in model.Session.query(model.Group)\
            .filter(model.Group.type == 'organization')\
            .filter(model.Group.state == 'active').all():
        # "== True" builds an SQL expression here; it is not a Python
        # truth test, so it must not be rewritten as "is True".
        archivals = model.Session.query(Archival)\
            .filter(Archival.is_broken == True)\
            .join(model.Package, Archival.package_id == model.Package.id)\
            .filter(model.Package.owner_org == org.id)\
            .filter(model.Package.state == 'active')\
            .join(model.Resource, Archival.resource_id == model.Resource.id)\
            .filter(model.Resource.state == 'active')
        broken_resources = archivals.count()
        broken_datasets = archivals.distinct(model.Package.id).count()

        num_datasets = model.Session.query(model.Package)\
            .filter_by(owner_org=org.id)\
            .filter_by(state='active')\
            .count()

        num_resources = model.Session.query(model.Package)\
            .filter_by(owner_org=org.id)\
            .filter_by(state='active')
        if p.toolkit.check_ckan_version(max_version='2.2.99'):
            # For CKAN versions up to 2.2.x the join from Package to
            # Resource goes via ResourceGroup.
            num_resources = num_resources.join(model.ResourceGroup)
        num_resources = num_resources \
            .join(model.Resource)\
            .filter_by(state='active')\
            .count()

        counts[org.name] = {
            'organization_title': org.title,
            'broken_packages': broken_datasets,
            'broken_resources': broken_resources,
            'packages': num_datasets,
            'resources': num_resources,
        }

    if include_sub_organizations:
        # Aggregate into a copy, so that what gets added is always a
        # sub-organization's own counts (read from `counts`) and never a
        # figure that has already had other organizations added to it.
        results = copy.deepcopy(counts)
        for org_name, org_totals in results.items():
            org = model.Group.by_name(org_name)

            for sub_org_id, sub_org_name, sub_org_title, sub_org_parent_id \
                    in org.get_children_group_hierarchy(type='organization'):
                if sub_org_name not in counts:
                    # occurs only if there is an organization created
                    # since the last loop?
                    continue
                sub_org_counts = counts[sub_org_name]
                for key in count_keys:
                    org_totals[key] += sub_org_counts[key]
    else:
        results = counts

    data = []
    num_broken_packages = 0
    num_broken_resources = 0
    num_packages = 0
    num_resources = 0
    # .items() rather than .iteritems() so this runs on Python 2 and 3.
    # Sorting on the name only avoids ever comparing the count dicts.
    for org_name, org_counts in sorted(results.items(), key=lambda r: r[0]):
        data.append(OrderedDict((
            ('organization_title', org_counts['organization_title']),
            ('organization_name', org_name),
            ('package_count', org_counts['packages']),
            ('resource_count', org_counts['resources']),
            ('broken_package_count', org_counts['broken_packages']),
            ('broken_package_percent',
             lib.percent(org_counts['broken_packages'],
                         org_counts['packages'])),
            ('broken_resource_count', org_counts['broken_resources']),
            ('broken_resource_percent',
             lib.percent(org_counts['broken_resources'],
                         org_counts['resources'])),
            )))
        # Totals - always use the counts, rather than the counts including
        # sub-organizations, to avoid counting a package in both its org
        # and parent org
        own_counts = counts[org_name]
        num_broken_packages += own_counts['broken_packages']
        num_broken_resources += own_counts['broken_resources']
        num_packages += own_counts['packages']
        num_resources += own_counts['resources']

    return {'table': data,
            'num_broken_packages': num_broken_packages,
            'num_broken_resources': num_broken_resources,
            'num_packages': num_packages,
            'num_resources': num_resources,
            'broken_package_percent': lib.percent(num_broken_packages,
                                                  num_packages),
            'broken_resource_percent': lib.percent(num_broken_resources,
                                                   num_resources),
            }
110
111
112
113
114


def broken_links_for_organization(organization, include_sub_organizations=False):
    '''
    Returns a dictionary detailing broken resource links for the
    organization.

    params:
      organization - name of an organization
      include_sub_organizations - if True, also cover the packages owned by
          the organizations that lib.go_down_tree() yields for this
          organization

    Raises p.toolkit.ObjectNotFound if the organization cannot be found.

    Returns:
    {'organization_name': 'cabinet-office',
     'organization_title': 'Cabinet Office',
     'num_broken_packages', 'num_broken_resources', 'num_packages',
     'num_resources', 'broken_package_percent',
     'broken_resource_percent',
     'table': [
       {'dataset_title', 'dataset_name', 'dataset_notes',
        'organization_title', 'organization_name', 'resource_position',
        'resource_id', 'resource_url', 'url_up_to_date', 'via',
        'first_failure', 'last_updated', 'last_success',
        'url_redirected_to', 'reason', 'status', 'failure_count'}
      ...]}
    '''
    from ckanext.archiver.model import Archival

    org = model.Group.get(organization)
    if not org:
        raise p.toolkit.ObjectNotFound()

    name = org.name
    title = org.title

    # "== True" builds an SQL expression here; it is not a Python truth
    # test, so it must not be rewritten as "is True".
    archivals = model.Session.query(Archival, model.Package, model.Group).\
        filter(Archival.is_broken == True).\
        join(model.Package, Archival.package_id == model.Package.id).\
        filter(model.Package.state == 'active').\
        join(model.Resource, Archival.resource_id == model.Resource.id).\
        filter(model.Resource.state == 'active')

    if not include_sub_organizations:
        org_ids = [org.id]
        archivals = archivals.filter(model.Package.owner_org == org.id)
    else:
        # We want any organization_id that is part of this organization's
        # tree
        org_ids = ['%s' % tree_org.id for tree_org in lib.go_down_tree(org)]
        archivals = archivals.filter(model.Package.owner_org.in_(org_ids))

    archivals = archivals.join(model.Group,
                               model.Package.owner_org == model.Group.id)

    results = []

    # pkg and pkg_org come straight from the joined query, so the package
    # does not need to be looked up again for each row.
    for archival, pkg, pkg_org in archivals.all():
        resource = model.Resource.get(archival.resource_id)

        # "or ''" also covers an extra that is present but stored as None
        via = ''
        er = pkg.extras.get('external_reference') or ''
        if er == 'ONSHUB':
            via = "Stats Hub"
        elif er.startswith("DATA4NR"):
            via = "Data4nr"

        # Report the URL from the resource revision whose timestamp matches
        # the archival; fall back to the current resource if there is no
        # such revision.
        archived_resource = model.Session.query(model.ResourceRevision)\
            .filter_by(id=resource.id)\
            .filter_by(revision_timestamp=archival.resource_timestamp)\
            .first() or resource

        row_data = OrderedDict((
            ('dataset_title', pkg.title),
            ('dataset_name', pkg.name),
            ('dataset_notes', lib.dataset_notes(pkg)),
            ('organization_title', pkg_org.title),
            ('organization_name', pkg_org.name),
            ('resource_position', resource.position),
            ('resource_id', resource.id),
            ('resource_url', archived_resource.url),
            ('url_up_to_date', resource.url == archived_resource.url),
            ('via', via),
            ('first_failure',
             archival.first_failure.isoformat()
             if archival.first_failure else None),
            ('last_updated',
             archival.updated.isoformat() if archival.updated else None),
            ('last_success',
             archival.last_success.isoformat()
             if archival.last_success else None),
            ('url_redirected_to', archival.url_redirected_to),
            ('reason', archival.reason),
            ('status', archival.status),
            ('failure_count', archival.failure_count),
            ))

        results.append(row_data)

    num_broken_packages = archivals.distinct(model.Package.name).count()
    num_broken_resources = len(results)

    # Get total number of packages & resources
    num_packages = model.Session.query(model.Package)\
        .filter(model.Package.owner_org.in_(org_ids))\
        .filter_by(state='active')\
        .count()
    num_resources = model.Session.query(model.Resource)\
        .filter_by(state='active')
    if p.toolkit.check_ckan_version(max_version='2.2.99'):
        # For CKAN versions up to 2.2.x the join from Resource to Package
        # goes via ResourceGroup.
        num_resources = num_resources.join(model.ResourceGroup)
    num_resources = num_resources \
        .join(model.Package)\
        .filter(model.Package.owner_org.in_(org_ids))\
        .filter_by(state='active').count()

    return {'organization_name': name,
            'organization_title': title,
            'num_broken_packages': num_broken_packages,
            'num_broken_resources': num_broken_resources,
            'num_packages': num_packages,
            'num_resources': num_resources,
            'broken_package_percent': lib.percent(num_broken_packages,
                                                  num_packages),
            'broken_resource_percent': lib.percent(num_broken_resources,
                                                   num_resources),
            'table': results}
219
220


221
def broken_links_option_combinations():
    '''Yield each combination of options for the broken-links report.

    For every value from lib.all_organizations(include_none=True), one
    options dict is yielded with include_sub_organizations False and then
    one with it True.
    '''
    for organization in lib.all_organizations(include_none=True):
        for include_sub_organizations in (False, True):
            yield {'organization': organization,
                   'include_sub_organizations': include_sub_organizations}


228
229
# Definition of the 'broken-links' report: its name and description, the
# default option values, the generator of option combinations, the function
# that generates the report data, and the template used to display it.
broken_links_report_info = {
    'name': 'broken-links',
    'description': 'Dataset resource URLs that are found to result in errors when resolved.',
    # Defaults match broken_links(organization=None,
    # include_sub_organizations=False), i.e. the index of all organizations.
    'option_defaults': OrderedDict((('organization', None),
                                    ('include_sub_organizations', False),
                                    )),
    'option_combinations': broken_links_option_combinations,
    'generate': broken_links,
    'template': 'report/broken_links.html',
    }