## $Id: batching.py 7649 2012-02-14 15:00:47Z henrik $
##
## Copyright (C) 2011 Uli Fouquet & Henrik Bettermann
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with this program; if not, write to the Free Software
## Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
##
"""SIRP components for batch processing.

Batch processors eat CSV files to add, update or remove large numbers
of certain kinds of objects at once.
"""
import grok
import copy
import csv
import os
import sys
import tempfile
import time
from zope.component import createObject
from zope.interface import Interface
from zope.schema import getFields
from waeup.sirp.interfaces import (
    IBatchProcessor, FatalCSVError, DuplicationError, IObjectConverter)

class BatchProcessor(grok.GlobalUtility):
    """A processor to add, update, or remove data.

    This is a non-active baseclass.
    """
    grok.provides(IBatchProcessor)
    grok.context(Interface)
    grok.baseclass()

    # Name used in pages and forms...
    name = u'Non-registered base importer'

    # Internal name...
    util_name = 'baseimporter'

    # Items for this processor need an interface with zope.schema fields.
    iface = Interface

    # The name must be the same as the util_name attribute in order to
    # register this utility correctly.
    grok.name(util_name)

    # Headers needed to locate items...
    location_fields = ['code', 'faculty_code']

    # A factory with this name must be registered...
    factory_name = 'waeup.Department'

    @property
    def required_fields(self):
        """Required fields that have no default.

        A list of names of field, whose value cannot be set if not
        given during creation. Therefore these fields must exist in
        input.

        Fields with a default != missing_value do not belong to this
        category.
        """
        result = []
        for key, field in getFields(self.iface).items():
            if key in self.location_fields:
                continue
            if field.default is not field.missing_value:
                continue
            if field.required:
                result.append(key)
        return result

    @property
    def req(self):
        result = dict(
            create = self.location_fields + self.required_fields,
            update = self.location_fields,
            remove = self.location_fields,
        )
        return result

    @property
    def available_fields(self):
        result = []
        return sorted(list(set(
                    self.location_fields + getFields(self.iface).keys())))

    def getHeaders(self, mode='create'):
        return self.available_fields

    def checkHeaders(self, headerfields, mode='create'):
        req = self.req[mode]
        # Check for required fields...
        for field in req:
            if not field in headerfields:
                raise FatalCSVError(
                    "Need at least columns %s for import!" %
                    ', '.join(["'%s'" % x for x in req]))
        # Check for double fields. Cannot happen because this error is
        # already catched in views
        not_ignored_fields = [x for x in headerfields
                              if not x.startswith('--')]
        if len(set(not_ignored_fields)) < len(not_ignored_fields):
            raise FatalCSVError(
                "Double headers: each column name may only appear once.")
        return True

    def applyMapping(self, row, mapping):
        """Apply mapping to a row of CSV data.

        """
        result = dict()
        for key, replacement in mapping.items():
            if replacement == u'--IGNORE--':
                # Skip ignored columns in failed and finished data files.
                continue
            result[replacement] = row[key]
        return result

    def getMapping(self, path, headerfields, mode):
        """Get a mapping from CSV file headerfields to actually used fieldnames.

        """
        result = dict()
        reader = csv.reader(open(path, 'rb'))
        raw_header = reader.next()
        for num, field in enumerate(headerfields):
            if field not in self.location_fields and mode == 'remove':
                # Skip non-location fields when removing.
                continue
            if field == u'--IGNORE--':
                # Skip ignored columns in failed and finished data files.
                continue
            result[raw_header[num]] = field
        return result

    def stringFromErrs(self, errors, inv_errors):
        result = []
        for err in errors:
            fieldname, message = err
            result.append("%s: %s" % (fieldname, message))
        for err in inv_errors:
            result.append("invariant: %s" % err)
        return '; '.join(result)

    def callFactory(self, *args, **kw):
        return createObject(self.factory_name)

    def parentsExist(self, row, site):
        """Tell whether the parent object for data in ``row`` exists.
        """
        raise NotImplementedError('method not implemented')

    def entryExists(self, row, site):
        """Tell whether there already exists an entry for ``row`` data.
        """
        raise NotImplementedError('method not implemented')

    def getParent(self, row, site):
        """Get the parent object for the entry in ``row``.
        """
        raise NotImplementedError('method not implemented')

    def getEntry(self, row, site):
        """Get the parent object for the entry in ``row``.
        """
        raise NotImplementedError('method not implemented')

    def addEntry(self, obj, row, site):
        """Add the entry given given by ``row`` data.
        """
        raise NotImplementedError('method not implemented')

    def delEntry(self, row, site):
        """Delete entry given by ``row`` data.
        """
        raise NotImplementedError('method not implemented')

    def updateEntry(self, obj, row, site):
        """Update obj to the values given in row.
        """
        for key, value in row.items():
            # Skip fields not declared in interface.
            if hasattr(obj, key):
                setattr(obj, key, value)
        return

    def createLogfile(self, path, fail_path, num, warnings, mode, user,
                      timedelta, logger=None):
        """Write to log file.
        """
        if logger is None:
            return
        status = 'OK'
        if warnings > 0:
            status = 'FAILED'
        logger.info("-" * 20)
        logger.info("%s: Batch processing finished: %s" % (user, status))
        logger.info("%s: Source: %s" % (user, path))
        logger.info("%s: Mode: %s" % (user, mode))
        logger.info("%s: User: %s" % (user, user))
        if warnings > 0:
            logger.info("%s: Failed datasets: %s" % (
                    user, os.path.basename(fail_path)))
        logger.info("%s: Processing time: %0.3f s (%0.4f s/item)" % (
                user, timedelta, timedelta/(num or 1)))
        logger.info("%s: Processed: %s lines (%s successful/ %s failed)" % (
                user, num, num - warnings, warnings
                ))
        logger.info("-" * 20)
        return

    def writeFailedRow(self, writer, row, warnings):
        """Write a row with error messages to error CSV.

        If warnings is a list of strings, they will be concatenated.
        """
        error_col = warnings
        if isinstance(warnings, list):
            error_col = ' / '.join(warnings)
        row['--ERRORS--'] = error_col
        writer.writerow(row)
        return

    def checkConversion(self, row, mode='ignore'):
        """Validates all values in row.
        """
        converter = IObjectConverter(self.iface)
        errs, inv_errs, conv_dict =  converter.fromStringDict(
            row, self.factory_name)
        return errs, inv_errs, conv_dict

    def doImport(self, path, headerfields, mode='create', user='Unknown',
                 logger=None):
        """Perform actual import.
        """
        time_start = time.time()
        self.checkHeaders(headerfields, mode)
        mapping = self.getMapping(path, headerfields, mode)
        reader = csv.DictReader(open(path, 'rb'))

        temp_dir = tempfile.mkdtemp()

        base = os.path.basename(path)
        (base, ext) = os.path.splitext(base)
        failed_path = os.path.join(temp_dir, "%s.pending%s" % (base, ext))
        failed_headers = mapping.values()
        failed_headers.append('--ERRORS--')
        failed_writer = csv.DictWriter(open(failed_path, 'wb'),
                                       failed_headers)
        failed_writer.writerow(dict([(x,x) for x in failed_headers]))

        finished_path = os.path.join(temp_dir, "%s.finished%s" % (base, ext))
        finished_headers = mapping.values()
        finished_writer = csv.DictWriter(open(finished_path, 'wb'),
                                         finished_headers)
        finished_writer.writerow(dict([(x,x) for x in finished_headers]))

        num =0
        num_warns = 0
        site = grok.getSite()
        
        for raw_row in reader:
            num += 1
            string_row = self.applyMapping(raw_row, mapping)
            row = dict(string_row.items()) # create deep copy
            errs, inv_errs, conv_dict = self.checkConversion(string_row, mode)
            if errs or inv_errs:
                num_warns += 1
                conv_warnings = self.stringFromErrs(errs, inv_errs)
                self.writeFailedRow(
                    failed_writer, string_row, conv_warnings)
                continue
            row.update(conv_dict)

            if mode == 'create':
                if not self.parentsExist(row, site):
                    num_warns += 1
                    self.writeFailedRow(
                        failed_writer, string_row,
                        "Not all parents do exist yet. Skipping")
                    continue
                if self.entryExists(row, site):
                    num_warns += 1
                    self.writeFailedRow(
                        failed_writer, string_row,
                        "This object already exists in the same container. Skipping.")
                    continue
                obj = self.callFactory()
                # Override all values in row, also
                # student_ids and applicant_ids which have been
                # generated in the respective __init__ methods before.
                self.updateEntry(obj, row, site)
                try:
                    self.addEntry(obj, row, site)
                except KeyError, error:
                    num_warns += 1
                    self.writeFailedRow(
                        failed_writer, string_row,
                        "%s Skipping." % error.message)
                    continue
            elif mode == 'remove':
                if not self.entryExists(row, site):
                    num_warns += 1
                    self.writeFailedRow(
                        failed_writer, string_row,
                        "Cannot remove: no such entry.")
                    continue
                self.delEntry(row, site)
            elif mode == 'update':
                obj = self.getEntry(row, site)
                if obj is None:
                    num_warns += 1
                    self.writeFailedRow(
                        failed_writer, string_row,
                        "Cannot update: no such entry.")
                    continue
                self.updateEntry(obj, row, site)
            finished_writer.writerow(string_row)

        time_end = time.time()
        timedelta = time_end - time_start

        self.createLogfile(path, failed_path, num, num_warns, mode, user,
                           timedelta, logger=logger)
        failed_path = os.path.abspath(failed_path)
        if num_warns == 0:
            del failed_writer
            os.unlink(failed_path)
            failed_path = None
        return (num, num_warns,
                os.path.abspath(finished_path), failed_path)