source: main/waeup.kofa/trunk/src/waeup/kofa/utils/batching.py @ 7918

Last change on this file since 7918 was 7907, checked in by uli, 13 years ago

Add title attribute for exporters.

  • Property svn:keywords set to Id
File size: 15.0 KB
Line 
1## $Id: batching.py 7907 2012-03-18 14:25:25Z uli $
2##
3## Copyright (C) 2011 Uli Fouquet & Henrik Bettermann
4## This program is free software; you can redistribute it and/or modify
5## it under the terms of the GNU General Public License as published by
6## the Free Software Foundation; either version 2 of the License, or
7## (at your option) any later version.
8##
9## This program is distributed in the hope that it will be useful,
10## but WITHOUT ANY WARRANTY; without even the implied warranty of
11## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12## GNU General Public License for more details.
13##
14## You should have received a copy of the GNU General Public License
15## along with this program; if not, write to the Free Software
16## Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17##
18"""Kofa components for batch processing.
19
20Batch processors eat CSV files to add, update or remove large numbers
21of certain kinds of objects at once.
22"""
23import grok
24import csv
25import os
26import tempfile
27import time
28from cStringIO import StringIO
29from zope.component import createObject
30from zope.interface import Interface
31from zope.schema import getFields
32from waeup.kofa.interfaces import (
33    IBatchProcessor, FatalCSVError, IObjectConverter,
34    ICSVExporter)
35
class BatchProcessor(grok.GlobalUtility):
    """A processor to add, update, or remove data.

    This is a non-active baseclass. Concrete processors are expected
    to override the class attributes below and to implement the
    methods that raise :exc:`NotImplementedError` here
    (``parentsExist``, ``entryExists``, ``getParent``, ``getEntry``,
    ``addEntry`` and ``delEntry``).
    """
    grok.provides(IBatchProcessor)
    grok.context(Interface)
    grok.baseclass()

    # Name used in pages and forms...
    name = u'Non-registered base importer'

    # Internal name...
    util_name = 'baseimporter'

    # Items for this processor need an interface with zope.schema fields.
    iface = Interface

    # The name must be the same as the util_name attribute in order to
    # register this utility correctly.
    grok.name(util_name)

    # Headers needed to locate items...
    location_fields = ['code', 'faculty_code']

    # A factory with this name must be registered...
    factory_name = 'waeup.Department'

    @property
    def required_fields(self):
        """Required fields that have no default.

        A list of names of field, whose value cannot be set if not
        given during creation. Therefore these fields must exist in
        input.

        Fields with a default != missing_value do not belong to this
        category. Location fields are excluded as well; they are
        handled separately (see :attr:`req`).
        """
        result = []
        for key, field in getFields(self.iface).items():
            if key in self.location_fields:
                continue
            if field.default is not field.missing_value:
                continue
            if field.required:
                result.append(key)
        return result

    @property
    def req(self):
        """Mapping of import mode to the list of column names it needs.

        ``create`` needs the location fields plus all required fields,
        ``update`` and ``remove`` need the location fields only.
        """
        result = dict(
            create=self.location_fields + self.required_fields,
            update=self.location_fields,
            remove=self.location_fields,
        )
        return result

    @property
    def available_fields(self):
        """Sorted list of all location fields and interface field names.

        Duplicates are removed.
        """
        return sorted(set(
            self.location_fields + list(getFields(self.iface).keys())))

    def getHeaders(self, mode='create'):
        """Return the header names available for import.

        ``mode`` is accepted but not evaluated by this implementation.
        """
        return self.available_fields

    def checkHeaders(self, headerfields, mode='create'):
        """Check that `headerfields` is usable for an import in `mode`.

        Raises :exc:`FatalCSVError` if a field required for `mode` is
        missing or if a non-ignored header (one not starting with
        ``--``) appears more than once. Returns ``True`` otherwise.
        """
        req = self.req[mode]
        # Check for required fields...
        for field in req:
            if field not in headerfields:
                raise FatalCSVError(
                    "Need at least columns %s for import!" %
                    ', '.join(["'%s'" % x for x in req]))
        # Check for double fields. Cannot happen because this error is
        # already caught in views
        not_ignored_fields = [x for x in headerfields
                              if not x.startswith('--')]
        if len(set(not_ignored_fields)) < len(not_ignored_fields):
            raise FatalCSVError(
                "Double headers: each column name may only appear once.")
        return True

    def applyMapping(self, row, mapping):
        """Apply mapping to a row of CSV data.

        `mapping` maps keys of `row` to the names to use in the
        result. Entries mapped to ``--IGNORE--`` are dropped. A new
        dict is returned; `row` is not modified.
        """
        result = dict()
        for key, replacement in mapping.items():
            if replacement == u'--IGNORE--':
                # Skip ignored columns in failed and finished data files.
                continue
            result[replacement] = row[key]
        return result

    def getMapping(self, path, headerfields, mode):
        """Get a mapping from CSV file headerfields to actually used fieldnames.

        The first line of the CSV file at `path` is read and its
        columns are paired by position with `headerfields`. Columns
        named ``--IGNORE--`` are left out, and in ``remove`` mode all
        columns that are not location fields are left out as well.
        """
        result = dict()
        # Only the header line is needed; close the file right away.
        with open(path, 'rb') as csv_file:
            raw_header = next(csv.reader(csv_file))
        for num, field in enumerate(headerfields):
            if field not in self.location_fields and mode == 'remove':
                # Skip non-location fields when removing.
                continue
            if field == u'--IGNORE--':
                # Skip ignored columns in failed and finished data files.
                continue
            result[raw_header[num]] = field
        return result

    def stringFromErrs(self, errors, inv_errors):
        """Turn conversion errors into a single string.

        `errors` is an iterable of ``(fieldname, message)`` pairs,
        `inv_errors` an iterable of invariant error messages. All
        entries are joined with ``'; '``.
        """
        result = []
        for err in errors:
            fieldname, message = err
            result.append("%s: %s" % (fieldname, message))
        for err in inv_errors:
            result.append("invariant: %s" % err)
        return '; '.join(result)

    def callFactory(self, *args, **kw):
        """Create a new object using the factory named `factory_name`.

        Positional and keyword arguments are accepted but not passed
        on by this implementation.
        """
        return createObject(self.factory_name)

    def parentsExist(self, row, site):
        """Tell whether the parent object for data in ``row`` exists.
        """
        raise NotImplementedError('method not implemented')

    def entryExists(self, row, site):
        """Tell whether there already exists an entry for ``row`` data.
        """
        raise NotImplementedError('method not implemented')

    def getParent(self, row, site):
        """Get the parent object for the entry in ``row``.
        """
        raise NotImplementedError('method not implemented')

    def getEntry(self, row, site):
        """Get the object for the entry in ``row``.

        :meth:`doImport` treats a return value of ``None`` as
        "no such entry".
        """
        raise NotImplementedError('method not implemented')

    def addEntry(self, obj, row, site):
        """Add the entry given by ``row`` data.

        A :exc:`KeyError` raised here is caught by :meth:`doImport`
        and reported as a failed row.
        """
        raise NotImplementedError('method not implemented')

    def delEntry(self, row, site):
        """Delete entry given by ``row`` data.
        """
        raise NotImplementedError('method not implemented')

    def updateEntry(self, obj, row, site):
        """Update obj to the values given in row.

        Only keys for which `obj` already has an attribute are set;
        all other keys in `row` are silently skipped.
        """
        for key, value in row.items():
            # Skip keys the object has no attribute for.
            if hasattr(obj, key):
                setattr(obj, key, value)
        return

    def createLogfile(self, path, fail_path, num, warnings, mode, user,
                      timedelta, logger=None):
        """Write a summary of a finished batch run to `logger`.

        Nothing is written if `logger` is ``None``. The run is
        reported as ``FAILED`` if `warnings` (the number of failed
        rows) is greater than zero and as ``OK`` otherwise.
        """
        if logger is None:
            return
        status = 'OK'
        if warnings > 0:
            status = 'FAILED'
        logger.info("-" * 20)
        logger.info("%s: Batch processing finished: %s" % (user, status))
        logger.info("%s: Source: %s" % (user, path))
        logger.info("%s: Mode: %s" % (user, mode))
        logger.info("%s: User: %s" % (user, user))
        if warnings > 0:
            logger.info("%s: Failed datasets: %s" % (
                    user, os.path.basename(fail_path)))
        # ``num or 1`` avoids a division by zero for empty input files.
        logger.info("%s: Processing time: %0.3f s (%0.4f s/item)" % (
                user, timedelta, timedelta/(num or 1)))
        logger.info("%s: Processed: %s lines (%s successful/ %s failed)" % (
                user, num, num - warnings, warnings
                ))
        logger.info("-" * 20)
        return

    def writeFailedRow(self, writer, row, warnings):
        """Write a row with error messages to error CSV.

        If warnings is a list of strings, they will be concatenated.

        Note that `row` is modified: the error text is stored under
        the key ``--ERRORS--`` before the row is written.
        """
        error_col = warnings
        if isinstance(warnings, list):
            error_col = ' / '.join(warnings)
        row['--ERRORS--'] = error_col
        writer.writerow(row)
        return

    def checkConversion(self, row, mode='ignore'):
        """Validates all values in row.

        Returns a tuple ``(errs, inv_errs, conv_dict)`` as delivered
        by the :class:`IObjectConverter` for `iface`. ``mode`` is not
        evaluated by this implementation.
        """
        converter = IObjectConverter(self.iface)
        errs, inv_errs, conv_dict = converter.fromStringDict(
            row, self.factory_name)
        return errs, inv_errs, conv_dict

    def _processRow(self, row, mode, site):
        """Create, remove or update the entry described by `row`.

        Returns ``None`` on success or a warning string telling why
        the row could not be processed. Unknown modes do nothing and
        count as success.
        """
        if mode == 'create':
            if not self.parentsExist(row, site):
                return "Not all parents do exist yet. Skipping"
            if self.entryExists(row, site):
                return ("This object already exists in the same "
                        "container. Skipping.")
            obj = self.callFactory()
            # Override all values in row, also
            # student_ids and applicant_ids which have been
            # generated in the respective __init__ methods before.
            self.updateEntry(obj, row, site)
            try:
                self.addEntry(obj, row, site)
            except KeyError as error:
                return "%s Skipping." % error.message
        elif mode == 'remove':
            if not self.entryExists(row, site):
                return "Cannot remove: no such entry."
            self.delEntry(row, site)
        elif mode == 'update':
            obj = self.getEntry(row, site)
            if obj is None:
                return "Cannot update: no such entry."
            self.updateEntry(obj, row, site)
        return None

    def doImport(self, path, headerfields, mode='create', user='Unknown',
                 logger=None):
        """Perform actual import.

        The CSV file at `path` is read row by row. Each row is mapped
        to the names in `headerfields`, converted, and then created,
        removed or updated depending on `mode`. Rows that could not be
        processed are written, together with an ``--ERRORS--`` column,
        to a ``.pending`` CSV file; all other rows are written to a
        ``.finished`` CSV file. Both files are placed in a newly
        created temporary directory.

        Returns a tuple ``(num, num_warns, finished_path,
        failed_path)`` with the number of rows read, the number of
        failed rows and the absolute paths of the result files. If no
        row failed, the pending file is removed and `failed_path` is
        ``None``.

        Raises :exc:`FatalCSVError` if `headerfields` do not pass
        :meth:`checkHeaders`.
        """
        time_start = time.time()
        self.checkHeaders(headerfields, mode)
        mapping = self.getMapping(path, headerfields, mode)

        temp_dir = tempfile.mkdtemp()

        base = os.path.basename(path)
        (base, ext) = os.path.splitext(base)
        failed_path = os.path.join(temp_dir, "%s.pending%s" % (base, ext))
        finished_path = os.path.join(temp_dir, "%s.finished%s" % (base, ext))
        finished_headers = list(mapping.values())
        failed_headers = finished_headers + ['--ERRORS--']

        num = 0
        num_warns = 0
        site = grok.getSite()

        # All three files are closed (and thereby flushed) as soon as
        # processing ends, also if an exception is raised on the way.
        with open(path, 'rb') as source:
            with open(failed_path, 'wb') as failed_file:
                with open(finished_path, 'wb') as finished_file:
                    reader = csv.DictReader(source)
                    failed_writer = csv.DictWriter(
                        failed_file, failed_headers)
                    failed_writer.writerow(
                        dict(zip(failed_headers, failed_headers)))
                    finished_writer = csv.DictWriter(
                        finished_file, finished_headers)
                    finished_writer.writerow(
                        dict(zip(finished_headers, finished_headers)))

                    for raw_row in reader:
                        num += 1
                        string_row = self.applyMapping(raw_row, mapping)
                        # Shallow copy: converted values go into `row`,
                        # the result files get the original strings.
                        row = dict(string_row)
                        errs, inv_errs, conv_dict = self.checkConversion(
                            string_row, mode)
                        if errs or inv_errs:
                            warning = self.stringFromErrs(errs, inv_errs)
                        else:
                            row.update(conv_dict)
                            warning = self._processRow(row, mode, site)
                        if warning is not None:
                            num_warns += 1
                            self.writeFailedRow(
                                failed_writer, string_row, warning)
                            continue
                        finished_writer.writerow(string_row)

        time_end = time.time()
        timedelta = time_end - time_start

        self.createLogfile(path, failed_path, num, num_warns, mode, user,
                           timedelta, logger=logger)
        failed_path = os.path.abspath(failed_path)
        if num_warns == 0:
            # Nothing failed: do not leave an empty pending file behind.
            os.unlink(failed_path)
            failed_path = None
        return (num, num_warns,
                os.path.abspath(finished_path), failed_path)
344
class ExporterBase(object):
    """Common base class for CSV exporters.

    Derived classes are expected to set `fields` and `title` and to
    implement :meth:`export` and :meth:`export_all`.
    """
    grok.implements(ICSVExporter)

    #: Fieldnames considered by this exporter
    fields = ('code', 'title', 'title_prefix')

    #: The title under which this exporter will be displayed
    #: (if registered as a utility)
    title = 'Override this title'

    def mangle_value(self, value, name, context=None):
        """Prepare `value` for being written into a CSV file.

        Booleans become ``'1'`` or ``'0'``, unicode strings are
        encoded as UTF-8 and ``None`` becomes the empty string. Any
        other value is returned unchanged. Derived classes may
        override this hook; `name` and `context` are not evaluated
        here.
        """
        if value is None:
            # None is not really representable in CSV files
            return ''
        if isinstance(value, bool):
            return '1' if value else '0'
        if isinstance(value, unicode):
            # CSV writers like byte streams better than unicode
            return value.encode('utf-8')
        return value

    def get_csv_writer(self, filepath=None):
        """Create a CSV dict writer whose header row is already written.

        Returns a tuple ``(<writer>, <outfile>)``. ``<writer>`` is a
        :class:`csv.DictWriter` for the columns in `fields` and
        ``<outfile>`` is the file object it writes to: a new StringIO
        object if `filepath` is ``None``, else the file at `filepath`
        opened for binary writing.

        A file opened from `filepath` is write-only; close it before
        reopening it for reading.
        """
        outfile = StringIO() if filepath is None else open(filepath, 'wb')
        csv_writer = csv.DictWriter(outfile, self.fields)
        header = dict((name, name) for name in self.fields)
        csv_writer.writerow(header)
        return csv_writer, outfile

    def write_item(self, obj, writer):
        """Write one CSV row for `obj` using `writer`.

        For each name in `fields` the attribute of that name is read
        from `obj` (``None`` if missing) and passed through
        :meth:`mangle_value`.
        """
        writer.writerow(dict(
            (name, self.mangle_value(getattr(obj, name, None), name, obj))
            for name in self.fields))
        return

    def close_outfile(self, filepath, outfile):
        """Finish writing to `outfile`.

        If `filepath` is ``None`` the complete contents of `outfile`
        are returned. Otherwise `outfile` is closed and ``None`` is
        returned.
        """
        outfile.seek(0)
        if filepath is not None:
            outfile.close()
            return
        return outfile.read()

    def export(self, iterable, filepath=None):
        """Export `iterable` as CSV file.

        With `filepath` being ``None`` implementations should return
        the CSV data as a raw string. Not implemented in this base
        class.
        """
        raise NotImplementedError

    def export_all(self, site, filepath=None):
        """Export all appropriate objects in `site` as CSV data to
        `filepath`.

        With `filepath` being ``None`` implementations should return
        the CSV data as a raw string. Not implemented in this base
        class.
        """
        raise NotImplementedError
Note: See TracBrowser for help on using the repository browser.