## $Id: batching.py 8887 2012-07-03 10:41:56Z henrik $ ## ## Copyright (C) 2011 Uli Fouquet & Henrik Bettermann ## This program is free software; you can redistribute it and/or modify ## it under the terms of the GNU General Public License as published by ## the Free Software Foundation; either version 2 of the License, or ## (at your option) any later version. ## ## This program is distributed in the hope that it will be useful, ## but WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ## GNU General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with this program; if not, write to the Free Software ## Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ## """Kofa components for batch processing. Batch processors eat CSV files to add, update or remove large numbers of certain kinds of objects at once. """ import grok import csv import datetime import os import tempfile import time from cStringIO import StringIO from zope.component import createObject from zope.interface import Interface from zope.schema import getFields from zope.event import notify from waeup.kofa.interfaces import ( IBatchProcessor, FatalCSVError, IObjectConverter, ICSVExporter, IGNORE_MARKER, DuplicationError) class BatchProcessor(grok.GlobalUtility): """A processor to add, update, or remove data. This is a non-active baseclass. """ grok.implements(IBatchProcessor) grok.context(Interface) grok.baseclass() # Name used in pages and forms... name = u'Non-registered base processor' # Internal name... util_name = 'baseprocessor' # Items for this processor need an interface with zope.schema fields. iface = Interface # The name must be the same as the util_name attribute in order to # register this utility correctly. grok.name(util_name) # Headers needed to locate items... location_fields = ['code', 'faculty_code'] # A factory with this name must be registered... factory_name = 'waeup.Department' @property def required_fields(self): """Required fields that have no default. A list of names of field, whose value cannot be set if not given during creation. Therefore these fields must exist in input. Fields with a default != missing_value do not belong to this category. """ result = [] for key, field in getFields(self.iface).items(): if key in self.location_fields: continue if field.default is not field.missing_value: continue if field.required: result.append(key) return result @property def req(self): result = dict( create = self.location_fields + self.required_fields, update = self.location_fields, remove = self.location_fields, ) return result @property def available_fields(self): return sorted(list(set( self.location_fields + getFields(self.iface).keys()))) def getHeaders(self, mode='create'): return self.available_fields def checkHeaders(self, headerfields, mode='create'): req = self.req[mode] # Check for required fields... for field in req: if not field in headerfields: raise FatalCSVError( "Need at least columns %s for import!" % ', '.join(["'%s'" % x for x in req])) # Check for double fields. Cannot happen because this error is # already catched in views not_ignored_fields = [x for x in headerfields if not x.startswith('--')] if len(set(not_ignored_fields)) < len(not_ignored_fields): raise FatalCSVError( "Double headers: each column name may only appear once.") return True def applyMapping(self, row, mapping): """Apply mapping to a row of CSV data. """ result = dict() for key, replacement in mapping.items(): if replacement == u'--IGNORE--': # Skip ignored columns in failed and finished data files. continue result[replacement] = row[key] return result def getMapping(self, path, headerfields, mode): """Get a mapping from CSV file headerfields to actually used fieldnames. """ result = dict() reader = csv.reader(open(path, 'rb')) raw_header = reader.next() for num, field in enumerate(headerfields): if field not in self.location_fields and mode == 'remove': # Skip non-location fields when removing. continue if field == u'--IGNORE--': # Skip ignored columns in failed and finished data files. continue result[raw_header[num]] = field return result def stringFromErrs(self, errors, inv_errors): result = [] for err in errors: fieldname, message = err result.append("%s: %s" % (fieldname, message)) for err in inv_errors: result.append("invariant: %s" % err) return '; '.join(result) def callFactory(self, *args, **kw): return createObject(self.factory_name) def parentsExist(self, row, site): """Tell whether the parent object for data in ``row`` exists. """ raise NotImplementedError('method not implemented') def entryExists(self, row, site): """Tell whether there already exists an entry for ``row`` data. """ raise NotImplementedError('method not implemented') def getParent(self, row, site): """Get the parent object for the entry in ``row``. """ raise NotImplementedError('method not implemented') def getEntry(self, row, site): """Get the parent object for the entry in ``row``. """ raise NotImplementedError('method not implemented') def addEntry(self, obj, row, site): """Add the entry given given by ``row`` data. """ raise NotImplementedError('method not implemented') def delEntry(self, row, site): """Delete entry given by ``row`` data. """ raise NotImplementedError('method not implemented') def checkUpdateRequirements(self, obj, row, site): """Checks requirements the object must fulfill when being updated. This method is not used in case of deleting or adding objects. Returns error messages as strings in case of requirement problems. """ return None def updateEntry(self, obj, row, site): """Update obj to the values given in row. Returns a string describing the fields changed. """ changed = [] for key, value in row.items(): # Skip fields to be ignored. if value == IGNORE_MARKER: continue # Skip fields not declared in interface and which are # not yet attributes of existing objects. We can thus not # add non-existing attributes here. if not hasattr(obj, key): continue setattr(obj, key, value) log_value = getattr(value, 'code', value) changed.append('%s=%s' % (key, log_value)) # If any catalog is involved it must be updated. # # XXX: The event is also triggered when creating objects as # updateEntry is called also when creating entries resulting # in objectAdded and additional objectModified events. if len(changed): notify(grok.ObjectModifiedEvent(obj)) return ', '.join(changed) def createLogfile(self, path, fail_path, num, warnings, mode, user, timedelta, logger=None): """Write to log file. """ if logger is None: return status = 'OK' if warnings > 0: status = 'FAILED' logger.info("-" * 20) logger.info("%s: Batch processing finished: %s" % (user, status)) logger.info("%s: Source: %s" % (user, path)) logger.info("%s: Mode: %s" % (user, mode)) logger.info("%s: User: %s" % (user, user)) if warnings > 0: logger.info("%s: Failed datasets: %s" % ( user, os.path.basename(fail_path))) logger.info("%s: Processing time: %0.3f s (%0.4f s/item)" % ( user, timedelta, timedelta/(num or 1))) logger.info("%s: Processed: %s lines (%s successful/ %s failed)" % ( user, num, num - warnings, warnings )) logger.info("-" * 20) return def writeFailedRow(self, writer, row, warnings): """Write a row with error messages to error CSV. If warnings is a list of strings, they will be concatenated. """ error_col = warnings if isinstance(warnings, list): error_col = ' / '.join(warnings) row['--ERRORS--'] = error_col writer.writerow(row) return def checkConversion(self, row, mode='ignore', ignore_empty=True): """Validates all values in row. """ converter = IObjectConverter(self.iface) errs, inv_errs, conv_dict = converter.fromStringDict( row, self.factory_name, mode=mode) return errs, inv_errs, conv_dict def doImport(self, path, headerfields, mode='create', user='Unknown', logger=None, ignore_empty=True): """Perform actual import. """ time_start = time.time() self.checkHeaders(headerfields, mode) mapping = self.getMapping(path, headerfields, mode) reader = csv.DictReader(open(path, 'rb')) temp_dir = tempfile.mkdtemp() base = os.path.basename(path) (base, ext) = os.path.splitext(base) failed_path = os.path.join(temp_dir, "%s.pending%s" % (base, ext)) failed_headers = mapping.values() failed_headers.append('--ERRORS--') failed_writer = csv.DictWriter(open(failed_path, 'wb'), failed_headers) os.chmod(failed_path, 0664) failed_writer.writerow(dict([(x,x) for x in failed_headers])) finished_path = os.path.join(temp_dir, "%s.finished%s" % (base, ext)) finished_headers = mapping.values() finished_writer = csv.DictWriter(open(finished_path, 'wb'), finished_headers) finished_writer.writerow(dict([(x,x) for x in finished_headers])) os.chmod(finished_path, 0664) num =0 num_warns = 0 site = grok.getSite() for raw_row in reader: num += 1 string_row = self.applyMapping(raw_row, mapping) if ignore_empty and mode in ('update',): # replace empty strings with ignore-markers for key, val in string_row.items(): if val == '': string_row[key] = IGNORE_MARKER row = dict(string_row.items()) # create deep copy errs, inv_errs, conv_dict = self.checkConversion(string_row, mode) if errs or inv_errs: num_warns += 1 conv_warnings = self.stringFromErrs(errs, inv_errs) self.writeFailedRow( failed_writer, string_row, conv_warnings) continue row.update(conv_dict) if mode == 'create': if not self.parentsExist(row, site): num_warns += 1 self.writeFailedRow( failed_writer, string_row, "Not all parents do exist yet. Skipping") continue if self.entryExists(row, site): num_warns += 1 self.writeFailedRow( failed_writer, string_row, "This object already exists. Skipping.") continue obj = self.callFactory() # Override all values in row, also # student_ids and applicant_ids which have been # generated in the respective __init__ methods before. self.updateEntry(obj, row, site) try: self.addEntry(obj, row, site) except KeyError, error: num_warns += 1 self.writeFailedRow( failed_writer, string_row, "%s Skipping." % error.message) continue except DuplicationError, error: num_warns += 1 self.writeFailedRow( failed_writer, string_row, "%s Skipping." % error.msg) continue elif mode == 'remove': if not self.entryExists(row, site): num_warns += 1 self.writeFailedRow( failed_writer, string_row, "Cannot remove: no such entry.") continue self.delEntry(row, site) elif mode == 'update': obj = self.getEntry(row, site) if obj is None: num_warns += 1 self.writeFailedRow( failed_writer, string_row, "Cannot update: no such entry.") continue update_errors = self.checkUpdateRequirements(obj, row, site) if update_errors is not None: num_warns += 1 self.writeFailedRow( failed_writer, string_row, update_errors) continue self.updateEntry(obj, row, site) finished_writer.writerow(string_row) time_end = time.time() timedelta = time_end - time_start self.createLogfile(path, failed_path, num, num_warns, mode, user, timedelta, logger=logger) failed_path = os.path.abspath(failed_path) if num_warns == 0: del failed_writer os.unlink(failed_path) failed_path = None return (num, num_warns, os.path.abspath(finished_path), failed_path) class ExporterBase(object): """A base for exporters. """ grok.implements(ICSVExporter) #: Fieldnames considered by this exporter fields = ('code', 'title', 'title_prefix') #: The title under which this exporter will be displayed #: (if registered as a utility) title = 'Override this title' def mangle_value(self, value, name, context=None): """Hook for mangling values in derived classes """ if isinstance(value, bool): value = value and '1' or '0' elif isinstance(value, unicode): # CSV writers like byte streams better than unicode value = value.encode('utf-8') elif isinstance(value, datetime.datetime): value = str(value) elif isinstance(value, datetime.date): # Order is important here: check for date after datetime as # datetimes are also dates. # # Append hash '#' to dates to circumvent unwanted excel automatic value = str('%s#' % value) elif value is None: # None is not really representable in CSV files value = '' return value def get_csv_writer(self, filepath=None): """Get a CSV dict writer instance open for writing. Returns a tuple (, ) where ```` is a :class:`csv.DictWriter` instance and outfile is the real file which is written to. The latter is important when writing to StringIO and can normally be ignored otherwise. The returned file will already be filled with the header row. Please note that if you give a filepath, the returned outfile is open for writing only and you might have to close it before reopening it for reading. """ if filepath is None: outfile = StringIO() else: outfile = open(filepath, 'wb') writer = csv.DictWriter(outfile, self.fields) writer.writerow(dict(zip(self.fields, self.fields))) # header return writer, outfile def write_item(self, obj, writer): """Write a row extracted from `obj` into CSV file using `writer`. """ row = {} for name in self.fields: value = getattr(obj, name, None) value = self.mangle_value(value, name, obj) row[name] = value writer.writerow(row) return def close_outfile(self, filepath, outfile): """Close outfile. If filepath is None, the contents of outfile is returned. """ outfile.seek(0) if filepath is None: return outfile.read() outfile.close() return def export(self, iterable, filepath=None): """Export `iterable` as CSV file. If `filepath` is ``None``, a raw string with CSV data should be returned. """ raise NotImplementedError def export_all(self, site, filepath=None): """Export all appropriate objects in `site` into `filepath` as CSV data. If `filepath` is ``None``, a raw string with CSV data should be returned. """ raise NotImplementedError