## $Id: batching.py 7196 2011-11-25 07:44:52Z henrik $ ## ## Copyright (C) 2011 Uli Fouquet & Henrik Bettermann ## This program is free software; you can redistribute it and/or modify ## it under the terms of the GNU General Public License as published by ## the Free Software Foundation; either version 2 of the License, or ## (at your option) any later version. ## ## This program is distributed in the hope that it will be useful, ## but WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ## GNU General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with this program; if not, write to the Free Software ## Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ## """WAeUP components for batch processing. Batch processors eat CSV files to add, update or remove large numbers of certain kinds of objects at once. """ import grok import copy import csv import os import sys import tempfile import time from zope.component import createObject from zope.interface import Interface from zope.schema import getFields from waeup.sirp.interfaces import ( IBatchProcessor, FatalCSVError, DuplicationError, IObjectConverter) class BatchProcessor(grok.GlobalUtility): """A processor to add, update, or remove data. This is a non-active baseclass. """ grok.provides(IBatchProcessor) grok.context(Interface) grok.baseclass() # Name used in pages and forms... name = u'Non-registered base importer' # Internal name... util_name = 'baseimporter' # Items for this processor need an interface with zope.schema fields. iface = Interface # The name must be the same as the util_name attribute in order to # register this utility correctly. grok.name(util_name) # Headers needed to locate items... location_fields = ['code', 'faculty_code'] # A factory with this name must be registered... factory_name = 'waeup.Department' @property def required_fields(self): """Required fields that have no default. A list of names of field, whose value cannot be set if not given during creation. Therefore these fields must exist in input. Fields with a default != missing_value do not belong to this category. """ result = [] for key, field in getFields(self.iface).items(): if key in self.location_fields: continue if field.default is not field.missing_value: continue if field.required: result.append(key) return result @property def req(self): result = dict( create = self.location_fields + self.required_fields, update = self.location_fields, remove = self.location_fields, ) return result @property def available_fields(self): result = [] return sorted(list(set( self.location_fields + getFields(self.iface).keys()))) def getHeaders(self, mode='create'): return self.available_fields def checkHeaders(self, headerfields, mode='create'): req = self.req[mode] # Check for required fields... for field in req: if not field in headerfields: raise FatalCSVError( "Need at least columns %s for import!" % ', '.join(["'%s'" % x for x in req])) # Check for double fields. Cannot happen because this error is # already catched in views not_ignored_fields = [x for x in headerfields if not x.startswith('--')] if len(set(not_ignored_fields)) < len(not_ignored_fields): raise FatalCSVError( "Double headers: each column name may only appear once.") return True def applyMapping(self, row, mapping): """Apply mapping to a row of CSV data. """ result = dict() for key, replacement in mapping.items(): if replacement == u'--IGNORE--': # Skip ignored columns in failed and finished data files. continue result[replacement] = row[key] return result def getMapping(self, path, headerfields, mode): """Get a mapping from CSV file headerfields to actually used fieldnames. """ result = dict() reader = csv.reader(open(path, 'rb')) raw_header = reader.next() for num, field in enumerate(headerfields): if field not in self.location_fields and mode == 'remove': # Skip non-location fields when removing. continue if field == u'--IGNORE--': # Skip ignored columns in failed and finished data files. continue result[raw_header[num]] = field return result def stringFromErrs(self, errors, inv_errors): result = [] for err in errors: fieldname, message = err result.append("%s: %s" % (fieldname, message)) for err in inv_errors: result.append("invariant: %s" % err) return '; '.join(result) def callFactory(self, *args, **kw): return createObject(self.factory_name) def parentsExist(self, row, site): """Tell whether the parent object for data in ``row`` exists. """ raise NotImplementedError('method not implemented') def entryExists(self, row, site): """Tell whether there already exists an entry for ``row`` data. """ raise NotImplementedError('method not implemented') def getParent(self, row, site): """Get the parent object for the entry in ``row``. """ raise NotImplementedError('method not implemented') def getEntry(self, row, site): """Get the parent object for the entry in ``row``. """ raise NotImplementedError('method not implemented') def addEntry(self, obj, row, site): """Add the entry given given by ``row`` data. """ raise NotImplementedError('method not implemented') def delEntry(self, row, site): """Delete entry given by ``row`` data. """ raise NotImplementedError('method not implemented') def updateEntry(self, obj, row, site): """Update obj to the values given in row. """ for key, value in row.items(): # Skip fields not declared in interface. if hasattr(obj, key): setattr(obj, key, value) return def createLogfile(self, path, fail_path, num, warnings, mode, user, timedelta, logger=None): """Write to log file. """ if logger is None: return status = 'OK' if warnings > 0: status = 'FAILED' logger.info("-" * 20) logger.info("%s: Batch processing finished: %s" % (user, status)) logger.info("%s: Source: %s" % (user, path)) logger.info("%s: Mode: %s" % (user, mode)) logger.info("%s: User: %s" % (user, user)) if warnings > 0: logger.info("%s: Failed datasets: %s" % ( user, os.path.basename(fail_path))) logger.info("%s: Processing time: %0.3f s (%0.4f s/item)" % ( user, timedelta, timedelta/(num or 1))) logger.info("%s: Processed: %s lines (%s successful/ %s failed)" % ( user, num, num - warnings, warnings )) logger.info("-" * 20) return def writeFailedRow(self, writer, row, warnings): """Write a row with error messages to error CSV. If warnings is a list of strings, they will be concatenated. """ error_col = warnings if isinstance(warnings, list): error_col = ' / '.join(warnings) row['--ERRORS--'] = error_col writer.writerow(row) return def checkConversion(self, row, mode='ignore'): """Validates all values in row. """ converter = IObjectConverter(self.iface) errs, inv_errs, conv_dict = converter.fromStringDict( row, self.factory_name) return errs, inv_errs, conv_dict def doImport(self, path, headerfields, mode='create', user='Unknown', logger=None): """Perform actual import. """ time_start = time.time() self.checkHeaders(headerfields, mode) mapping = self.getMapping(path, headerfields, mode) reader = csv.DictReader(open(path, 'rb')) temp_dir = tempfile.mkdtemp() base = os.path.basename(path) (base, ext) = os.path.splitext(base) failed_path = os.path.join(temp_dir, "%s.pending%s" % (base, ext)) failed_headers = mapping.values() failed_headers.append('--ERRORS--') failed_writer = csv.DictWriter(open(failed_path, 'wb'), failed_headers) failed_writer.writerow(dict([(x,x) for x in failed_headers])) finished_path = os.path.join(temp_dir, "%s.finished%s" % (base, ext)) finished_headers = mapping.values() finished_writer = csv.DictWriter(open(finished_path, 'wb'), finished_headers) finished_writer.writerow(dict([(x,x) for x in finished_headers])) num =0 num_warns = 0 site = grok.getSite() for raw_row in reader: num += 1 string_row = self.applyMapping(raw_row, mapping) row = dict(string_row.items()) # create deep copy errs, inv_errs, conv_dict = self.checkConversion(string_row, mode) if errs or inv_errs: num_warns += 1 conv_warnings = self.stringFromErrs(errs, inv_errs) self.writeFailedRow( failed_writer, string_row, conv_warnings) continue row.update(conv_dict) if mode == 'create': if not self.parentsExist(row, site): num_warns += 1 self.writeFailedRow( failed_writer, string_row, "Not all parents do exist yet. Skipping") continue if self.entryExists(row, site): num_warns += 1 self.writeFailedRow( failed_writer, string_row, "This object already exists in the same container. Skipping.") continue obj = self.callFactory() self.updateEntry(obj, row, site) try: self.addEntry(obj, row, site) except KeyError, error: num_warns += 1 self.writeFailedRow( failed_writer, string_row, "%s Skipping." % error.message) continue elif mode == 'remove': if not self.entryExists(row, site): num_warns += 1 self.writeFailedRow( failed_writer, string_row, "Cannot remove: no such entry.") continue self.delEntry(row, site) elif mode == 'update': obj = self.getEntry(row, site) if obj is None: num_warns += 1 self.writeFailedRow( failed_writer, string_row, "Cannot update: no such entry.") continue self.updateEntry(obj, row, site) finished_writer.writerow(string_row) time_end = time.time() timedelta = time_end - time_start self.createLogfile(path, failed_path, num, num_warns, mode, user, timedelta, logger=logger) failed_path = os.path.abspath(failed_path) if num_warns == 0: del failed_writer os.unlink(failed_path) failed_path = None return (num, num_warns, os.path.abspath(finished_path), failed_path)