Context navigation

source: main/waeup.kofa/trunk/src/waeup/kofa/utils/batching.py @ 8333

Last change on this file since 8333 was 8333, checked in by uli, 13 years ago
Do not notify if objects stay unchanged.
Property svn:keywords set to `Id`
File size: 16.7 KB

Line
1	## $Id: batching.py 8333 2012-05-03 13:01:03Z uli $
2	##
3	## Copyright (C) 2011 Uli Fouquet & Henrik Bettermann
4	## This program is free software; you can redistribute it and/or modify
5	## it under the terms of the GNU General Public License as published by
6	## the Free Software Foundation; either version 2 of the License, or
7	## (at your option) any later version.
8	##
9	## This program is distributed in the hope that it will be useful,
10	## but WITHOUT ANY WARRANTY; without even the implied warranty of
11	## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12	## GNU General Public License for more details.
13	##
14	## You should have received a copy of the GNU General Public License
15	## along with this program; if not, write to the Free Software
16	## Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17	##
18	"""Kofa components for batch processing.
19
20	Batch processors eat CSV files to add, update or remove large numbers
21	of certain kinds of objects at once.
22	"""
23	import grok
24	import csv
25	import os
26	import tempfile
27	import time
28	from cStringIO import StringIO
29	from zope.component import createObject
30	from zope.interface import Interface
31	from zope.schema import getFields
32	from zope.event import notify
33	from waeup.kofa.interfaces import (
34	IBatchProcessor, FatalCSVError, IObjectConverter,
35	ICSVExporter, IGNORE_MARKER)
36
class BatchProcessor(grok.GlobalUtility):
    """A processor to add, update, or remove data.

    This is a non-active baseclass. The location related methods
    (``parentsExist``, ``entryExists``, ``getParent``, ``getEntry``,
    ``addEntry`` and ``delEntry``) raise `NotImplementedError` here and
    have to be provided by derived classes.
    """
    grok.implements(IBatchProcessor)
    grok.context(Interface)
    grok.baseclass()

    # Name used in pages and forms...
    name = u'Non-registered base processor'

    # Internal name...
    util_name = 'baseprocessor'

    # Items for this processor need an interface with zope.schema fields.
    iface = Interface

    # The name must be the same as the util_name attribute in order to
    # register this utility correctly.
    grok.name(util_name)

    # Headers needed to locate items...
    location_fields = ['code', 'faculty_code']

    # A factory with this name must be registered...
    factory_name = 'waeup.Department'

    @property
    def required_fields(self):
        """Required fields that have no default.

        A list of names of field, whose value cannot be set if not
        given during creation. Therefore these fields must exist in
        input.

        Fields with a default != missing_value do not belong to this
        category. Location fields are not listed here either.
        """
        return [
            key for key, field in getFields(self.iface).items()
            if key not in self.location_fields
            and field.default is field.missing_value
            and field.required]

    @property
    def req(self):
        """Dict mapping each import mode to the header names it requires.

        Keys are ``'create'``, ``'update'`` and ``'remove'``.
        """
        result = dict(
            create=self.location_fields + self.required_fields,
            update=self.location_fields,
            remove=self.location_fields,
            )
        return result

    @property
    def available_fields(self):
        """Sorted list of location fields plus all fields of `iface`,
        without duplicates.
        """
        return sorted(set(
            self.location_fields + list(getFields(self.iface))))

    def getHeaders(self, mode='create'):
        """Return the header names this processor knows about.

        `mode` is accepted but not evaluated here; the result is always
        `available_fields`.
        """
        return self.available_fields

    def checkHeaders(self, headerfields, mode='create'):
        """Check whether `headerfields` are usable for importing in `mode`.

        Raises `FatalCSVError` if a field required for `mode` is missing
        or if a header name (other than those starting with ``--``)
        appears more than once. Returns ``True`` otherwise.
        """
        req = self.req[mode]
        # Check for required fields...
        for field in req:
            if field not in headerfields:
                raise FatalCSVError(
                    "Need at least columns %s for import!" %
                    ', '.join(["'%s'" % x for x in req]))
        # Check for double fields. Cannot happen because this error is
        # already caught in views
        not_ignored_fields = [x for x in headerfields
                              if not x.startswith('--')]
        if len(set(not_ignored_fields)) < len(not_ignored_fields):
            raise FatalCSVError(
                "Double headers: each column name may only appear once.")
        return True

    def applyMapping(self, row, mapping):
        """Apply mapping to a row of CSV data.

        `mapping` maps raw CSV column names to the field names to use.
        Returns a new dict keyed by the replacement names; columns
        mapped to ``--IGNORE--`` are left out.
        """
        result = dict()
        for key, replacement in mapping.items():
            if replacement == u'--IGNORE--':
                # Skip ignored columns in failed and finished data files.
                continue
            result[replacement] = row[key]
        return result

    def getMapping(self, path, headerfields, mode):
        """Get a mapping from CSV file headerfields to actually used
        fieldnames.

        The first line of the CSV file at `path` delivers the raw
        column names; `headerfields` delivers, position by position,
        the names to use instead. In ``'remove'`` mode only location
        fields are mapped. Columns marked ``--IGNORE--`` are skipped.
        """
        result = dict()
        # Only the header line is needed; close the file right away
        # instead of leaving that to the garbage collector.
        with open(path, 'rb') as csv_file:
            raw_header = next(csv.reader(csv_file))
        for num, field in enumerate(headerfields):
            if field not in self.location_fields and mode == 'remove':
                # Skip non-location fields when removing.
                continue
            if field == u'--IGNORE--':
                # Skip ignored columns in failed and finished data files.
                continue
            result[raw_header[num]] = field
        return result

    def stringFromErrs(self, errors, inv_errors):
        """Turn conversion errors into a single string.

        `errors` is an iterable of ``(fieldname, message)`` pairs,
        `inv_errors` an iterable of invariant error messages. All
        items are joined by ``'; '``.
        """
        result = []
        for err in errors:
            fieldname, message = err
            result.append("%s: %s" % (fieldname, message))
        for err in inv_errors:
            result.append("invariant: %s" % err)
        return '; '.join(result)

    def callFactory(self, *args, **kw):
        """Create a new object using the factory named `factory_name`.

        Any arguments are accepted and ignored here. `doImport` calls
        this method without arguments, so none must be required.
        """
        return createObject(self.factory_name)

    def parentsExist(self, row, site):
        """Tell whether the parent object for data in ``row`` exists.
        """
        raise NotImplementedError('method not implemented')

    def entryExists(self, row, site):
        """Tell whether there already exists an entry for ``row`` data.
        """
        raise NotImplementedError('method not implemented')

    def getParent(self, row, site):
        """Get the parent object for the entry in ``row``.
        """
        raise NotImplementedError('method not implemented')

    def getEntry(self, row, site):
        """Get the object for the entry in ``row``.

        `doImport` treats a ``None`` result as 'no such entry'.
        """
        raise NotImplementedError('method not implemented')

    def addEntry(self, obj, row, site):
        """Add the entry given by ``row`` data.

        A `KeyError` raised here is reported by `doImport` as a failed
        row.
        """
        raise NotImplementedError('method not implemented')

    def delEntry(self, row, site):
        """Delete entry given by ``row`` data.
        """
        raise NotImplementedError('method not implemented')

    def checkUpdateRequirements(self, obj, row, site):
        """Checks requirements the object must fulfill when being updated.

        This method is not used in case of deleting or adding objects.

        Returns error messages as strings in case of requirement
        problems. This base implementation always returns ``None``
        (no problems).
        """
        return None

    def updateEntry(self, obj, row, site):
        """Update obj to the values given in row.

        Returns a string describing the fields changed.
        """
        changed = []
        for key, value in row.items():
            # Skip fields to be ignored.
            if value == IGNORE_MARKER:
                continue
            # Skip fields not declared in interface and which are
            # not yet attributes of existing objects. We can thus not
            # add non-existing attributes here.
            if not hasattr(obj, key):
                continue
            setattr(obj, key, value)
            # Log the ``code`` of a value if it has one, else the
            # value itself.
            log_value = getattr(value, 'code', value)
            changed.append('%s=%s' % (key, log_value))

        # If any catalog is involved it must be updated.
        #
        # XXX: The event is also triggered when creating objects as
        # updateEntry is called also when creating entries resulting
        # in objectAdded and additional objectModified events.
        if len(changed):
            notify(grok.ObjectModifiedEvent(obj))

        return ', '.join(changed)

    def createLogfile(self, path, fail_path, num, warnings, mode, user,
                      timedelta, logger=None):
        """Write a summary of a finished import to `logger`.

        Does nothing if `logger` is ``None``. `num` is the number of
        processed rows, `warnings` the number of failed ones and
        `timedelta` the processing time in seconds.
        """
        if logger is None:
            return
        status = 'OK'
        if warnings > 0:
            status = 'FAILED'
        logger.info("-" * 20)
        logger.info("%s: Batch processing finished: %s" % (user, status))
        logger.info("%s: Source: %s" % (user, path))
        logger.info("%s: Mode: %s" % (user, mode))
        logger.info("%s: User: %s" % (user, user))
        if warnings > 0:
            logger.info("%s: Failed datasets: %s" % (
                user, os.path.basename(fail_path)))
        # ``num or 1`` avoids a division by zero for empty input files.
        logger.info("%s: Processing time: %0.3f s (%0.4f s/item)" % (
            user, timedelta, timedelta/(num or 1)))
        logger.info("%s: Processed: %s lines (%s successful/ %s failed)" % (
            user, num, num - warnings, warnings
            ))
        logger.info("-" * 20)
        return

    def writeFailedRow(self, writer, row, warnings):
        """Write a row with error messages to error CSV.

        If warnings is a list of strings, they will be concatenated.
        The message is stored in `row` under ``--ERRORS--`` (`row` is
        modified in place) before the row is written.
        """
        error_col = warnings
        if isinstance(warnings, list):
            error_col = ' / '.join(warnings)
        row['--ERRORS--'] = error_col
        writer.writerow(row)
        return

    def checkConversion(self, row, mode='ignore', ignore_empty=True):
        """Validates all values in row.

        Returns the tuple ``(errs, inv_errs, conv_dict)`` as delivered
        by the `IObjectConverter` for `iface`. `ignore_empty` is
        accepted but not evaluated in this implementation.
        """
        converter = IObjectConverter(self.iface)
        errs, inv_errs, conv_dict = converter.fromStringDict(
            row, self.factory_name, mode=mode)
        return errs, inv_errs, conv_dict

    def _importRow(self, string_row, mode, site):
        """Process a single, already mapped row in `mode`.

        Returns ``None`` if the row was processed successfully and the
        warning(s) to be written to the failed-rows file otherwise.
        """
        row = dict(string_row)  # shallow copy; keeps string_row untouched
        errs, inv_errs, conv_dict = self.checkConversion(string_row, mode)
        if errs or inv_errs:
            return self.stringFromErrs(errs, inv_errs)
        row.update(conv_dict)

        if mode == 'create':
            if not self.parentsExist(row, site):
                return "Not all parents do exist yet. Skipping"
            if self.entryExists(row, site):
                return "This object already exists. Skipping."
            obj = self.callFactory()
            # Override all values in row, also
            # student_ids and applicant_ids which have been
            # generated in the respective __init__ methods before.
            self.updateEntry(obj, row, site)
            try:
                self.addEntry(obj, row, site)
            except KeyError as error:
                # ``message`` is the Python 2 exception attribute.
                return "%s Skipping." % error.message
        elif mode == 'remove':
            if not self.entryExists(row, site):
                return "Cannot remove: no such entry."
            self.delEntry(row, site)
        elif mode == 'update':
            obj = self.getEntry(row, site)
            if obj is None:
                return "Cannot update: no such entry."
            update_errors = self.checkUpdateRequirements(obj, row, site)
            if update_errors is not None:
                return update_errors
            self.updateEntry(obj, row, site)
        return None

    def doImport(self, path, headerfields, mode='create', user='Unknown',
                 logger=None, ignore_empty=True):
        """Perform actual import.

        Reads the CSV file at `path`, processes each row in `mode`
        (``'create'``, ``'update'`` or ``'remove'``) and writes two CSV
        files into a fresh temporary directory: ``<name>.finished<ext>``
        with all successfully processed rows and ``<name>.pending<ext>``
        with all failed rows plus an ``--ERRORS--`` column. The
        temporary directory is not removed here.

        If `ignore_empty` is true, empty strings are replaced by
        `IGNORE_MARKER` in ``'update'`` mode.

        Returns a tuple ``(num, num_warns, finished_path, failed_path)``
        with absolute paths. ``failed_path`` is ``None`` (and the
        pending file removed) if no row failed.

        Raises `FatalCSVError` if `headerfields` do not fit `mode`.
        """
        time_start = time.time()
        self.checkHeaders(headerfields, mode)
        mapping = self.getMapping(path, headerfields, mode)

        temp_dir = tempfile.mkdtemp()
        (base, ext) = os.path.splitext(os.path.basename(path))
        failed_path = os.path.join(temp_dir, "%s.pending%s" % (base, ext))
        finished_path = os.path.join(
            temp_dir, "%s.finished%s" % (base, ext))
        finished_headers = list(mapping.values())
        failed_headers = finished_headers + ['--ERRORS--']

        num = 0
        num_warns = 0
        site = grok.getSite()

        # All three files are closed (and thus flushed) before any path
        # is handed back to the caller or the pending file is removed.
        with open(path, 'rb') as infile:
            with open(failed_path, 'wb') as failed_file:
                with open(finished_path, 'wb') as finished_file:
                    reader = csv.DictReader(infile)
                    failed_writer = csv.DictWriter(
                        failed_file, failed_headers)
                    failed_writer.writerow(
                        dict((x, x) for x in failed_headers))
                    finished_writer = csv.DictWriter(
                        finished_file, finished_headers)
                    finished_writer.writerow(
                        dict((x, x) for x in finished_headers))

                    for raw_row in reader:
                        num += 1
                        string_row = self.applyMapping(raw_row, mapping)
                        if ignore_empty and mode in ('update',):
                            # replace empty strings with ignore-markers
                            for key, val in string_row.items():
                                if val == '':
                                    string_row[key] = IGNORE_MARKER
                        row_warnings = self._importRow(
                            string_row, mode, site)
                        if row_warnings is not None:
                            num_warns += 1
                            self.writeFailedRow(
                                failed_writer, string_row, row_warnings)
                            continue
                        finished_writer.writerow(string_row)

        time_end = time.time()
        timedelta = time_end - time_start

        self.createLogfile(path, failed_path, num, num_warns, mode, user,
                           timedelta, logger=logger)
        failed_path = os.path.abspath(failed_path)
        if num_warns == 0:
            # Nothing failed: do not leave a header-only file behind.
            os.unlink(failed_path)
            failed_path = None
        return (num, num_warns,
                os.path.abspath(finished_path), failed_path)
386
class ExporterBase(object):
    """Common base for CSV exporters.

    Provides helpers to create a CSV writer, to write single objects
    and to finish the output. `export` and `export_all` have to be
    implemented by derived classes.
    """
    grok.implements(ICSVExporter)

    #: Fieldnames considered by this exporter
    fields = ('code', 'title', 'title_prefix')

    #: The title under which this exporter will be displayed
    #: (if registered as a utility)
    title = 'Override this title'

    def mangle_value(self, value, name, context=None):
        """Turn `value` into something writable to CSV.

        Booleans become ``'1'`` or ``'0'``, unicode strings are encoded
        as UTF-8 and ``None`` becomes the empty string. Any other value
        is returned unchanged. `name` and `context` are not used here;
        they serve as hooks for derived classes.
        """
        if value is None:
            # None is not really representable in CSV files
            return ''
        if isinstance(value, bool):
            return '1' if value else '0'
        if isinstance(value, unicode):
            # CSV writers like byte streams better than unicode
            return value.encode('utf-8')
        return value

    def get_csv_writer(self, filepath=None):
        """Create a CSV dict writer with the header row already written.

        Returns a tuple ``(<writer>, <outfile>)``: a
        :class:`csv.DictWriter` for `fields` and the file object it
        writes to. Without `filepath` the file object is a StringIO
        instance, which the caller needs in order to get at the data.

        With `filepath` given, the returned file is open for writing
        only; close it before reopening it for reading.
        """
        if filepath is not None:
            target = open(filepath, 'wb')
        else:
            target = StringIO()
        csv_writer = csv.DictWriter(target, self.fields)
        header_row = dict((name, name) for name in self.fields)
        csv_writer.writerow(header_row)
        return csv_writer, target

    def write_item(self, obj, writer):
        """Write one CSV row with the `fields` of `obj` using `writer`.

        Attributes missing on `obj` are treated as ``None``; every value
        is passed through `mangle_value` first.
        """
        extracted = dict(
            (name, self.mangle_value(getattr(obj, name, None), name, obj))
            for name in self.fields)
        writer.writerow(extracted)

    def close_outfile(self, filepath, outfile):
        """Finish writing to `outfile`.

        Without `filepath` the complete contents of `outfile` are
        returned and `outfile` stays open. Otherwise `outfile` is
        closed and ``None`` returned.
        """
        outfile.seek(0)
        if filepath is not None:
            outfile.close()
            return None
        return outfile.read()

    def export(self, iterable, filepath=None):
        """Export `iterable` as CSV file.

        Implementations should return a raw string with CSV data if
        `filepath` is ``None``. Not implemented in this base class.
        """
        raise NotImplementedError

    def export_all(self, site, filepath=None):
        """Export all appropriate objects in `site` into `filepath` as
        CSV data.

        Implementations should return a raw string with CSV data if
        `filepath` is ``None``. Not implemented in this base class.
        """
        raise NotImplementedError

Note: See TracBrowser for help on using the repository browser.

Download in other formats: