Context navigation

source: main/waeup.kofa/trunk/src/waeup/kofa/utils/batching.py @ 8222

Last change on this file since 8222 was 8222, checked in by uli, 12 years ago
Fix doImport. Still needs tests. Use code in log messages.
Property svn:keywords set to `Id`
File size: 16.3 KB

Line
1	## $Id: batching.py 8222 2012-04-19 15:39:17Z uli $
2	##
3	## Copyright (C) 2011 Uli Fouquet & Henrik Bettermann
4	## This program is free software; you can redistribute it and/or modify
5	## it under the terms of the GNU General Public License as published by
6	## the Free Software Foundation; either version 2 of the License, or
7	## (at your option) any later version.
8	##
9	## This program is distributed in the hope that it will be useful,
10	## but WITHOUT ANY WARRANTY; without even the implied warranty of
11	## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12	## GNU General Public License for more details.
13	##
14	## You should have received a copy of the GNU General Public License
15	## along with this program; if not, write to the Free Software
16	## Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17	##
18	"""Kofa components for batch processing.
19
20	Batch processors eat CSV files to add, update or remove large numbers
21	of certain kinds of objects at once.
22	"""
23	import grok
24	import csv
25	import os
26	import tempfile
27	import time
28	from cStringIO import StringIO
29	from zope.component import createObject
30	from zope.interface import Interface
31	from zope.schema import getFields
32	from waeup.kofa.interfaces import (
33	IBatchProcessor, FatalCSVError, IObjectConverter,
34	ICSVExporter, DELETION_MARKER, IGNORE_MARKER)
35
class BatchProcessor(grok.GlobalUtility):
    """A processor to add, update, or remove data.

    This is a non-active baseclass. Concrete processors are expected to
    override the class attributes below and to implement the methods
    that raise :exc:`NotImplementedError` here (``parentsExist``,
    ``entryExists``, ``getParent``, ``getEntry``, ``addEntry`` and
    ``delEntry``).
    """
    grok.implements(IBatchProcessor)
    grok.context(Interface)
    grok.baseclass()

    # Name used in pages and forms...
    name = u'Non-registered base processor'

    # Internal name...
    util_name = 'baseprocessor'

    # Items for this processor need an interface with zope.schema fields.
    iface = Interface

    # The name must be the same as the util_name attribute in order to
    # register this utility correctly.
    grok.name(util_name)

    # Headers needed to locate items...
    location_fields = ['code', 'faculty_code']

    # A factory with this name must be registered...
    factory_name = 'waeup.Department'

    @property
    def required_fields(self):
        """Required fields that have no default.

        A list of names of field, whose value cannot be set if not
        given during creation. Therefore these fields must exist in
        input.

        Fields with a default != missing_value do not belong to this
        category. Location fields are skipped here as well.
        """
        result = []
        for key, field in getFields(self.iface).items():
            if key in self.location_fields:
                continue
            if field.default is not field.missing_value:
                continue
            if field.required:
                result.append(key)
        return result

    @property
    def req(self):
        """Mapping of import mode to the header names that mode requires.

        ``create`` needs the location fields plus all required fields;
        ``update`` and ``remove`` need the location fields only.
        """
        result = dict(
            create=self.location_fields + self.required_fields,
            update=self.location_fields,
            remove=self.location_fields,
        )
        return result

    @property
    def available_fields(self):
        """Sorted list of all location fields and interface field names."""
        return sorted(
            set(self.location_fields) | set(getFields(self.iface)))

    def getHeaders(self, mode='create'):
        """Return the header names usable for import.

        ``mode`` is accepted but not evaluated here.
        """
        return self.available_fields

    def checkHeaders(self, headerfields, mode='create'):
        """Check that ``headerfields`` are usable for an import in ``mode``.

        Raises :exc:`FatalCSVError` if a header required for ``mode`` is
        missing or if a header not starting with ``--`` appears more
        than once. Returns ``True`` otherwise.
        """
        req = self.req[mode]
        # Check for required fields...
        for field in req:
            if field not in headerfields:
                raise FatalCSVError(
                    "Need at least columns %s for import!" %
                    ', '.join(["'%s'" % x for x in req]))
        # Check for double fields. Cannot happen because this error is
        # already caught in views
        not_ignored_fields = [x for x in headerfields
                              if not x.startswith('--')]
        if len(set(not_ignored_fields)) < len(not_ignored_fields):
            raise FatalCSVError(
                "Double headers: each column name may only appear once.")
        return True

    def applyMapping(self, row, mapping):
        """Apply mapping to a row of CSV data.

        ``mapping`` maps raw CSV column names to the field names to use.
        Returns a new dict keyed by the mapped names; columns mapped to
        ``--IGNORE--`` are dropped.
        """
        result = dict()
        for key, replacement in mapping.items():
            if replacement == u'--IGNORE--':
                # Skip ignored columns in failed and finished data files.
                continue
            result[replacement] = row[key]
        return result

    def getMapping(self, path, headerfields, mode):
        """Get a mapping from CSV file headerfields to actually used fieldnames.

        The first line of the CSV file at ``path`` is read and each raw
        header is paired by position with the entry in ``headerfields``.
        In ``remove`` mode only location fields are kept; columns marked
        ``--IGNORE--`` are always skipped.
        """
        # Close the file deterministically instead of relying on
        # garbage collection.
        with open(path, 'rb') as infile:
            raw_header = next(csv.reader(infile))
        result = dict()
        for num, field in enumerate(headerfields):
            if field not in self.location_fields and mode == 'remove':
                # Skip non-location fields when removing.
                continue
            if field == u'--IGNORE--':
                # Skip ignored columns in failed and finished data files.
                continue
            result[raw_header[num]] = field
        return result

    def stringFromErrs(self, errors, inv_errors):
        """Join conversion errors and invariant errors into one string.

        ``errors`` is an iterable of ``(fieldname, message)`` pairs,
        ``inv_errors`` an iterable of invariant error messages.
        """
        result = []
        for err in errors:
            fieldname, message = err
            result.append("%s: %s" % (fieldname, message))
        for err in inv_errors:
            result.append("invariant: %s" % err)
        return '; '.join(result)

    def callFactory(self, *args, **kw):
        """Create a new object using the factory named ``factory_name``.

        Any arguments are accepted and ignored, so that the method can
        be called without arguments (as ``doImport`` does).
        """
        return createObject(self.factory_name)

    def parentsExist(self, row, site):
        """Tell whether the parent object for data in ``row`` exists.
        """
        raise NotImplementedError('method not implemented')

    def entryExists(self, row, site):
        """Tell whether there already exists an entry for ``row`` data.
        """
        raise NotImplementedError('method not implemented')

    def getParent(self, row, site):
        """Get the parent object for the entry in ``row``.
        """
        raise NotImplementedError('method not implemented')

    def getEntry(self, row, site):
        """Get the object for the entry in ``row``.

        ``doImport`` treats a return value of ``None`` as 'no such entry'.
        """
        raise NotImplementedError('method not implemented')

    def addEntry(self, obj, row, site):
        """Add the entry given by ``row`` data.

        A :exc:`KeyError` raised here is reported by ``doImport`` as a
        failed row.
        """
        raise NotImplementedError('method not implemented')

    def delEntry(self, row, site):
        """Delete entry given by ``row`` data.
        """
        raise NotImplementedError('method not implemented')

    def checkUpdateRequirements(self, obj, row, site):
        """Checks requirements the object must fulfill when being updated.

        This method is not used in case of deleting or adding objects.

        Returns error messages as strings in case of requirement
        problems. The base implementation always returns ``None``.
        """
        return None

    def updateEntry(self, obj, row, site):
        """Update obj to the values given in row.

        Returns a string describing the fields changed.
        """
        changed = []
        for key, value in row.items():
            # Skip fields to be ignored.
            if value == IGNORE_MARKER:
                continue
            # Skip fields not declared in interface.
            if not hasattr(obj, key):
                continue
            setattr(obj, key, value)
            # Log the ``code`` of a value if it has one, else the value.
            log_value = getattr(value, 'code', value)
            changed.append('%s=%s' % (key, log_value))
        return ', '.join(changed)

    def createLogfile(self, path, fail_path, num, warnings, mode, user,
                      timedelta, logger=None):
        """Write a summary of a batch run to ``logger``.

        Does nothing if ``logger`` is ``None``. ``num`` is the number of
        processed rows, ``warnings`` the number of failed rows and
        ``timedelta`` the processing time in seconds.
        """
        if logger is None:
            return
        status = 'OK'
        if warnings > 0:
            status = 'FAILED'
        logger.info("-" * 20)
        logger.info("%s: Batch processing finished: %s" % (user, status))
        logger.info("%s: Source: %s" % (user, path))
        logger.info("%s: Mode: %s" % (user, mode))
        logger.info("%s: User: %s" % (user, user))
        if warnings > 0:
            logger.info("%s: Failed datasets: %s" % (
                    user, os.path.basename(fail_path)))
        # ``num or 1`` avoids a division by zero for empty input files.
        logger.info("%s: Processing time: %0.3f s (%0.4f s/item)" % (
                user, timedelta, timedelta/(num or 1)))
        logger.info("%s: Processed: %s lines (%s successful/ %s failed)" % (
                user, num, num - warnings, warnings
                ))
        logger.info("-" * 20)
        return

    def writeFailedRow(self, writer, row, warnings):
        """Write a row with error messages to error CSV.

        If warnings is a list of strings, they will be concatenated.
        Note that ``row`` is modified: the messages are stored under
        the ``--ERRORS--`` key.
        """
        error_col = warnings
        if isinstance(warnings, list):
            error_col = ' / '.join(warnings)
        row['--ERRORS--'] = error_col
        writer.writerow(row)
        return

    def checkConversion(self, row, mode='ignore', ignore_empty=True):
        """Validates all values in row.

        Returns a tuple ``(errs, inv_errs, conv_dict)`` as delivered by
        the object converter for ``iface``. ``ignore_empty`` is accepted
        but not evaluated here.
        """
        converter = IObjectConverter(self.iface)
        errs, inv_errs, conv_dict = converter.fromStringDict(
            row, self.factory_name, mode=mode)
        return errs, inv_errs, conv_dict

    def _processRow(self, row, mode, site):
        """Apply a single converted ``row`` in the given ``mode``.

        Returns ``None`` on success, otherwise the warning(s) describing
        why the row was skipped.
        """
        if mode == 'create':
            if not self.parentsExist(row, site):
                return "Not all parents do exist yet. Skipping"
            if self.entryExists(row, site):
                return ("This object already exists in the same container. "
                        "Skipping.")
            obj = self.callFactory()
            # Override all values in row, also
            # student_ids and applicant_ids which have been
            # generated in the respective __init__ methods before.
            self.updateEntry(obj, row, site)
            try:
                self.addEntry(obj, row, site)
            except KeyError as error:
                return "%s Skipping." % error.message
        elif mode == 'remove':
            if not self.entryExists(row, site):
                return "Cannot remove: no such entry."
            self.delEntry(row, site)
        elif mode == 'update':
            obj = self.getEntry(row, site)
            if obj is None:
                return "Cannot update: no such entry."
            update_errors = self.checkUpdateRequirements(obj, row, site)
            if update_errors is not None:
                return update_errors
            self.updateEntry(obj, row, site)
        return None

    def doImport(self, path, headerfields, mode='create', user='Unknown',
                 logger=None, ignore_empty=True):
        """Perform actual import.

        Reads the CSV file at ``path`` and processes each row according
        to ``mode`` (``create``, ``update`` or ``remove``). Successfully
        processed rows are written to a ``.finished`` file, rows that
        could not be processed to a ``.pending`` file (with an extra
        ``--ERRORS--`` column), both in a fresh temporary directory.

        With ``ignore_empty`` set, empty strings are replaced by ignore
        markers in ``update`` mode.

        Returns a tuple ``(num, num_warns, finished_path, failed_path)``
        where ``failed_path`` is ``None`` if no row failed. Raises
        :exc:`FatalCSVError` if the headers are not acceptable.
        """
        time_start = time.time()
        self.checkHeaders(headerfields, mode)
        mapping = self.getMapping(path, headerfields, mode)

        temp_dir = tempfile.mkdtemp()
        (base, ext) = os.path.splitext(os.path.basename(path))
        failed_path = os.path.join(temp_dir, "%s.pending%s" % (base, ext))
        finished_path = os.path.join(
            temp_dir, "%s.finished%s" % (base, ext))

        finished_headers = list(mapping.values())
        failed_headers = finished_headers + ['--ERRORS--']

        num = 0
        num_warns = 0
        site = grok.getSite()

        # All files opened here are closed in the ``finally`` clause, so
        # that the result files are completely written when their paths
        # are returned and the pending file can safely be removed.
        opened = []
        try:
            infile = open(path, 'rb')
            opened.append(infile)
            reader = csv.DictReader(infile)

            failed_file = open(failed_path, 'wb')
            opened.append(failed_file)
            failed_writer = csv.DictWriter(failed_file, failed_headers)
            failed_writer.writerow(dict([(x, x) for x in failed_headers]))

            finished_file = open(finished_path, 'wb')
            opened.append(finished_file)
            finished_writer = csv.DictWriter(finished_file, finished_headers)
            finished_writer.writerow(
                dict([(x, x) for x in finished_headers]))

            for raw_row in reader:
                num += 1
                string_row = self.applyMapping(raw_row, mapping)
                if ignore_empty and mode in ('update',):
                    # replace empty strings with ignore-markers
                    for key, val in list(string_row.items()):
                        if val == '':
                            string_row[key] = IGNORE_MARKER
                # Shallow copy: ``row`` receives the converted values
                # while ``string_row`` keeps the strings for the CSV
                # result files.
                row = dict(string_row.items())
                errs, inv_errs, conv_dict = self.checkConversion(
                    string_row, mode)
                if errs or inv_errs:
                    num_warns += 1
                    conv_warnings = self.stringFromErrs(errs, inv_errs)
                    self.writeFailedRow(
                        failed_writer, string_row, conv_warnings)
                    continue
                row.update(conv_dict)

                row_warnings = self._processRow(row, mode, site)
                if row_warnings is not None:
                    num_warns += 1
                    self.writeFailedRow(
                        failed_writer, string_row, row_warnings)
                    continue
                finished_writer.writerow(string_row)
        finally:
            for handle in opened:
                handle.close()

        time_end = time.time()
        timedelta = time_end - time_start

        self.createLogfile(path, failed_path, num, num_warns, mode, user,
                           timedelta, logger=logger)
        failed_path = os.path.abspath(failed_path)
        if num_warns == 0:
            # Nothing failed: do not leave a header-only pending file.
            os.unlink(failed_path)
            failed_path = None
        return (num, num_warns,
                os.path.abspath(finished_path), failed_path)
375
class ExporterBase(object):
    """Common base class for CSV exporters.

    Derived exporters are expected to set ``fields`` and ``title`` and
    to implement ``export`` and ``export_all``.
    """
    grok.implements(ICSVExporter)

    #: Fieldnames considered by this exporter
    fields = ('code', 'title', 'title_prefix')

    #: The title under which this exporter will be displayed
    #: (if registered as a utility)
    title = 'Override this title'

    def mangle_value(self, value, name, context=None):
        """Turn `value` into something writable to CSV.

        Booleans become ``'1'`` or ``'0'``, unicode strings are encoded
        as UTF-8 and ``None`` becomes an empty string. Any other value
        is returned unchanged. Derived classes may override this hook;
        `name` and `context` are not evaluated here.
        """
        if value is None:
            # None is not really representable in CSV files
            return ''
        if isinstance(value, bool):
            return '1' if value else '0'
        if isinstance(value, unicode):
            # CSV writers like byte streams better than unicode
            return value.encode('utf-8')
        return value

    def get_csv_writer(self, filepath=None):
        """Create a CSV dict writer with the header row already written.

        The result is a tuple ``(<writer>, <outfile>)``: ``<writer>`` is
        a :class:`csv.DictWriter` for ``fields`` and ``<outfile>`` the
        underlying file object. Without a `filepath` a StringIO object
        serves as outfile; otherwise the file at `filepath` is opened
        for (binary) writing only, so it might have to be closed before
        it can be reopened for reading.
        """
        outfile = StringIO() if filepath is None else open(filepath, 'wb')
        csv_writer = csv.DictWriter(outfile, self.fields)
        header_row = dict((name, name) for name in self.fields)
        csv_writer.writerow(header_row)
        return csv_writer, outfile

    def write_item(self, obj, writer):
        """Write one CSV row built from the attributes of `obj`.

        For each name in ``fields`` the attribute of `obj` (``None`` if
        missing) is passed through ``mangle_value`` before writing.
        """
        csv_row = dict(
            (name, self.mangle_value(getattr(obj, name, None), name, obj))
            for name in self.fields)
        writer.writerow(csv_row)
        return

    def close_outfile(self, filepath, outfile):
        """Finish writing to `outfile`.

        With `filepath` being ``None`` the complete contents of
        `outfile` are returned. Otherwise `outfile` is closed and
        ``None`` is returned.
        """
        outfile.seek(0)
        if filepath is not None:
            outfile.close()
            return
        return outfile.read()

    def export(self, iterable, filepath=None):
        """Export `iterable` as CSV file.

        Implementations should return a raw string with CSV data if
        `filepath` is ``None``. Not implemented in this base class.
        """
        raise NotImplementedError

    def export_all(self, site, filepath=None):
        """Export all appropriate objects in `site` into `filepath` as
        CSV data.

        Implementations should return a raw string with CSV data if
        `filepath` is ``None``. Not implemented in this base class.
        """
        raise NotImplementedError

Note: See TracBrowser for help on using the repository browser.

Download in other formats: